/*
| | * Adapted from |
| | * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h |
| | * Copyright (c) 2023, The vLLM team. |
| | * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. |
| | * |
| | * Licensed under the Apache License, Version 2.0 (the "License"); |
| | * you may not use this file except in compliance with the License. |
| | * You may obtain a copy of the License at |
| | * |
| | * http://www.apache.org/licenses/LICENSE-2.0 |
| | * |
| | * Unless required by applicable law or agreed to in writing, software |
| | * distributed under the License is distributed on an "AS IS" BASIS, |
| | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| | * See the License for the specific language governing permissions and |
| | * limitations under the License. |
| | */ |
| | #pragma once |
| |
|
| | #include <stdint.h> |
| |
|
| | namespace vllm { |
| |
|
| | |
// Primary template for a vector of VEC_SIZE elements of type T, used to load
// and store Q/K/V data in vectorized chunks. Deliberately empty: concrete
// storage types are supplied by per-(T, VEC_SIZE) specializations defined
// elsewhere, so instantiating an unsupported combination fails at compile time.
template <typename T, int VEC_SIZE>
struct Vec {};
| |
|
| | |
// Primary template mapping a vector type T to its FP32 counterpart (presumably
// used for accumulation at higher precision — specializations live elsewhere).
// Deliberately empty so unsupported types fail at compile time.
template <typename T>
struct FloatVec {};
| |
|
| | |
// Elementwise multiply of a and b, returning the result as type Acc.
// Declaration only: per-type specializations/overloads are defined elsewhere.
template <typename Acc, typename A, typename B>
inline __device__ Acc mul(A a, B b);
| |
|
// Horizontal sum of all elements of v, reduced to a single float.
// Declaration only: per-type specializations/overloads are defined elsewhere.
template <typename T>
inline __device__ float sum(T v);
| |
|
// Dot product of two vectors of type T: elementwise multiply (accumulating
// in T itself), then horizontal sum to a single float.
template <typename T>
inline __device__ float dot(T a, T b) {
  T products = mul<T, T, T>(a, b);
  return sum(products);
}
| |
|
// Dot product of two vectors of type T with an explicitly chosen accumulator
// type A (e.g. a wider FP32 vector), then horizontal sum to a single float.
// A is not deducible from the arguments, so callers select it explicitly.
template <typename A, typename T>
inline __device__ float dot(T a, T b) {
  A products = mul<A, T, T>(a, b);
  return sum(products);
}
| |
|
// Set dst to all-zero bytes by writing it one 32-bit word at a time through a
// union (type-punning, which CUDA C++ permits for this pattern).
template <typename T>
inline __device__ void zero(T& dst) {
  // sizeof(T) / 4 truncates, so a T whose size is not a multiple of 4 would
  // silently keep garbage in its trailing bytes (and a T smaller than 4 bytes
  // would produce a zero-length array). Reject such types at compile time.
  static_assert(sizeof(T) % 4 == 0,
                "zero() requires sizeof(T) to be a multiple of 4 bytes");
  constexpr int WORDS = sizeof(T) / 4;
  union {
    T raw;
    uint32_t words[WORDS];
  } tmp;

#pragma unroll
  for (int ii = 0; ii < WORDS; ++ii) {
    tmp.words[ii] = 0u;
  }
  dst = tmp.raw;
}
| |
|
| | } |
| |
|