Spaces:

Justinrune
/

LLaMA-Factory

Running

App Files Files Community

LLaMA-Factory / tests / model / model_utils / test_attention.py

Justinrune

Upload folder using huggingface_hub

2852136 verified 6 months ago

raw

history blame

1.81 kB

	# Copyright 2024 the LlamaFactory team.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import os

	from transformers.utils import is_flash_attn_2_available, is_torch_sdpa_available

	from llamafactory.hparams import get_infer_args
	from llamafactory.model import load_model, load_tokenizer


	# Model identifier used by this test module. Setting the TINY_LLAMA
	# environment variable overrides the built-in default.
	TINY_LLAMA = os.getenv("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")

	# Base arguments handed to get_infer_args(); the test merges its own
	# per-case keys on top of these.
	INFER_ARGS = {
	    "model_name_or_path": TINY_LLAMA,
	    "template": "llama3",
	}


	def test_attention():
	    """Check that each usable ``flash_attn`` setting yields the expected attention class.

	    The "off" setting is always exercised; "sdpa" and "fa2" are added only
	    when ``is_torch_sdpa_available()`` / ``is_flash_attn_2_available()``
	    report support. For every setting the model named by ``INFER_ARGS`` is
	    loaded, and every sub-module whose class name contains "Attention" must
	    have exactly the class name mapped to that setting.

	    Raises:
	        AssertionError: if no attention module is found in the loaded model,
	            or if any attention module has an unexpected class name.
	    """
	    attention_available = ["off"]
	    if is_torch_sdpa_available():
	        attention_available.append("sdpa")

	    if is_flash_attn_2_available():
	        attention_available.append("fa2")

	    # NOTE(review): these class names are tied to the installed transformers
	    # release -- confirm they still exist when upgrading the dependency.
	    llama_attention_classes = {
	        "off": "LlamaAttention",
	        "sdpa": "LlamaSdpaAttention",
	        "fa2": "LlamaFlashAttention2",
	    }
	    for requested_attention in attention_available:
	        expected_class = llama_attention_classes[requested_attention]
	        model_args, _, finetuning_args, _ = get_infer_args({"flash_attn": requested_attention, **INFER_ARGS})
	        tokenizer_module = load_tokenizer(model_args)
	        model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args)
	        attention_class_names = [
	            module.__class__.__name__ for module in model.modules() if "Attention" in module.__class__.__name__
	        ]
	        # Without this guard the per-module check below never runs when nothing
	        # matches, and the test would pass without verifying anything.
	        assert attention_class_names, f"no attention modules found for flash_attn={requested_attention!r}"
	        for class_name in attention_class_names:
	            assert class_name == expected_class, (
	                f"flash_attn={requested_attention!r}: expected {expected_class}, got {class_name}"
	            )