Problem loading with vllm
#8
by artishock - opened
I tried loading your model using vLLM, and I get this error:
```
engine = MQLLMEngine.from_engine_args(engine_args=engine_args,
File "/home/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/engine/multiprocessing/engine.py", line 119, in from_engine_args
return cls(ipc_path=ipc_path,
File "/home/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/engine/multiprocessing/engine.py", line 71, in __init__
self.engine = LLMEngine(*args, **kwargs)
File "/home/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 347, in __init__
self.model_executor = executor_class(vllm_config=vllm_config, )
File "/home/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/executor/distributed_gpu_executor.py", line 26, in __init__
super().__init__(*args, **kwargs)
File "/home/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/executor/executor_base.py", line 36, in __init__
self._init_executor()
File "/home/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/executor/multiproc_gpu_executor.py", line 114, in _init_executor
self._run_workers("load_model",
File "/home/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/executor/multiproc_gpu_executor.py", line 195, in _run_workers
driver_worker_output = driver_worker_method(*args, **kwargs)
File "/home/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/worker/worker.py", line 152, in load_model
self.model_runner.load_model()
File "/home/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/worker/model_runner.py", line 1074, in load_model
self.model = get_model(vllm_config=self.vllm_config)
File "/home/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/model_executor/model_loader/__init__.py", line 12, in get_model
return loader.load_model(vllm_config=vllm_config)
File "/home/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/model_executor/model_loader/loader.py", line 334, in load_model
model.load_weights(self._get_all_weights(model_config, model))
File "/home/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/model_executor/models/llama.py", line 586, in load_weights
loader.load_weights(
File "/home/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/model_executor/models/utils.py", line 229, in load_weights
autoloaded_weights = list(self._load_module("", self.module, weights))
File "/home/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/model_executor/models/utils.py", line 190, in _load_module
yield from self._load_module(prefix,
File "/home/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/model_executor/models/utils.py", line 175, in _load_module
module_load_weights(weights)
File "/home/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/model_executor/models/llama.py", line 410, in load_weights
weight_loader(param, loaded_weight)
File "/home/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/model_executor/layers/linear.py", line 1053, in weight_loader
assert param_data.shape == loaded_weight.shape
AssertionError
```
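For context, a minimal way to trigger this (a sketch only; every engine argument besides the model id is an assumption on my part):

```python
# Minimal reproduction sketch of the offline loading path.
# Only the model id comes from the traceback above; quantization and
# max_model_len are assumptions (they match the engine config logged
# in the comment below).
from vllm import LLM

llm = LLM(
    model="unsloth/Llama-3.1-Nemotron-70B-Instruct-bnb-4bit",
    quantization="bitsandbytes",  # checkpoint is pre-quantized 4-bit bnb
    max_model_len=16384,
)
```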
Same problem here:
```
kens_details=False, dispatch_function=<function serve at 0x7f67d7dbf9a0>)
INFO 11-23 08:20:59 api_server.py:175] Multiprocessing frontend to use ipc:///tmp/4c311651-9726-4619-91af-7ee567134302 for IPC Path.
INFO 11-23 08:20:59 api_server.py:194] Started engine process with PID 8789
INFO 11-23 08:21:05 config.py:350] This model supports multiple tasks: {'generate', 'embedding'}. Defaulting to 'generate'.
WARNING 11-23 08:21:05 config.py:428] bitsandbytes quantization is not fully optimized yet. The speed can be slower than non-quantized models.
WARNING 11-23 08:21:05 arg_utils.py:1075] [DEPRECATED] Block manager v1 has been removed, and setting --use-v2-block-manager to True or False has no effect on vLLM behavior. Please remove --use-v2-block-manager in your engine argument. If your use case is not supported by SelfAttnBlockSpaceManager (i.e. block manager v2), please file an issue with detailed information.
INFO 11-23 08:21:10 config.py:350] This model supports multiple tasks: {'embedding', 'generate'}. Defaulting to 'generate'.
WARNING 11-23 08:21:10 config.py:428] bitsandbytes quantization is not fully optimized yet. The speed can be slower than non-quantized models.
WARNING 11-23 08:21:10 arg_utils.py:1075] [DEPRECATED] Block manager v1 has been removed, and setting --use-v2-block-manager to True or False has no effect on vLLM behavior. Please remove --use-v2-block-manager in your engine argument. If your use case is not supported by SelfAttnBlockSpaceManager (i.e. block manager v2), please file an issue with detailed information.
INFO 11-23 08:21:10 llm_engine.py:249] Initializing an LLM engine (v0.6.4.post1) with config: model='unsloth/Llama-3.1-Nemotron-70B-Instruct-bnb-4bit', speculative_config=None, tokenizer='unsloth/Llama-3.1-Nemotron-70B-Instruct-bnb-4bit', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16384, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=bitsandbytes, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=unsloth/Llama-3.1-Nemotron-70B-Instruct-bnb-4bit, num_scheduler_steps=1, chunked_prefill_enabled=False multi_step_stream_outputs=True, enable_prefix_caching=False, use_async_output_proc=True, use_cached_outputs=True, chat_template_text_format=string, mm_processor_kwargs=None, pooler_config=None)
INFO 11-23 08:21:10 selector.py:135] Using Flash Attention backend.
INFO 11-23 08:21:11 model_runner.py:1072] Starting to load model unsloth/Llama-3.1-Nemotron-70B-Instruct-bnb-4bit...
INFO 11-23 08:21:11 weight_utils.py:243] Using model weights format ['*.safetensors']
Loading safetensors checkpoint shards: 0% Completed | 0/8 [00:00<?, ?it/s]
ERROR 11-23 08:21:12 engine.py:366] 'layers.74.mlp.down_proj.weight.absmax'
ERROR 11-23 08:21:12 engine.py:366] Traceback (most recent call last):
ERROR 11-23 08:21:12 engine.py:366] File "/app/venv/lib/python3.10/site-packages/vllm/engine/multiprocessing/engine.py", line 357, in run_mp_engine
ERROR 11-23 08:21:12 engine.py:366] engine = MQLLMEngine.from_engine_args(engine_args=engine_args,
ERROR 11-23 08:21:12 engine.py:366] File "/app/venv/lib/python3.10/site-packages/vllm/engine/multiprocessing/engine.py", line 119, in from_engine_args
ERROR 11-23 08:21:12 engine.py:366] return cls(ipc_path=ipc_path,
ERROR 11-23 08:21:12 engine.py:366] File "/app/venv/lib/python3.10/site-packages/vllm/engine/multiprocessing/engine.py", line 71, in __init__
ERROR 11-23 08:21:12 engine.py:366] self.engine = LLMEngine(*args, **kwargs)
ERROR 11-23 08:21:12 engine.py:366] File "/app/venv/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 347, in __init__
ERROR 11-23 08:21:12 engine.py:366] self.model_executor = executor_class(vllm_config=vllm_config, )
ERROR 11-23 08:21:12 engine.py:366] File "/app/venv/lib/python3.10/site-packages/vllm/executor/executor_base.py", line 36, in __init__
ERROR 11-23 08:21:12 engine.py:366] self._init_executor()
ERROR 11-23 08:21:12 engine.py:366] File "/app/venv/lib/python3.10/site-packages/vllm/executor/gpu_executor.py", line 40, in _init_executor
ERROR 11-23 08:21:12 engine.py:366] self.driver_worker.load_model()
ERROR 11-23 08:21:12 engine.py:366] File "/app/venv/lib/python3.10/site-packages/vllm/worker/worker.py", line 152, in load_model
ERROR 11-23 08:21:12 engine.py:366] self.model_runner.load_model()
ERROR 11-23 08:21:12 engine.py:366] File "/app/venv/lib/python3.10/site-packages/vllm/worker/model_runner.py", line 1074, in load_model
ERROR 11-23 08:21:12 engine.py:366] self.model = get_model(vllm_config=self.vllm_config)
ERROR 11-23 08:21:12 engine.py:366] File "/app/venv/lib/python3.10/site-packages/vllm/model_executor/model_loader/__init__.py", line 12, in get_model
ERROR 11-23 08:21:12 engine.py:366] return loader.load_model(vllm_config=vllm_config)
ERROR 11-23 08:21:12 engine.py:366] File "/app/venv/lib/python3.10/site-packages/vllm/model_executor/model_loader/loader.py", line 334, in load_model
ERROR 11-23 08:21:12 engine.py:366] model.load_weights(self._get_all_weights(model_config, model))
ERROR 11-23 08:21:12 engine.py:366] File "/app/venv/lib/python3.10/site-packages/vllm/model_executor/models/llama.py", line 586, in load_weights
ERROR 11-23 08:21:12 engine.py:366] loader.load_weights(
ERROR 11-23 08:21:12 engine.py:366] File "/app/venv/lib/python3.10/site-packages/vllm/model_executor/models/utils.py", line 229, in load_weights
ERROR 11-23 08:21:12 engine.py:366] autoloaded_weights = list(self._load_module("", self.module, weights))
ERROR 11-23 08:21:12 engine.py:366] File "/app/venv/lib/python3.10/site-packages/vllm/model_executor/models/utils.py", line 190, in _load_module
ERROR 11-23 08:21:12 engine.py:366] yield from self._load_module(prefix,
ERROR 11-23 08:21:12 engine.py:366] File "/app/venv/lib/python3.10/site-packages/vllm/model_executor/models/utils.py", line 175, in _load_module
ERROR 11-23 08:21:12 engine.py:366] module_load_weights(weights)
ERROR 11-23 08:21:12 engine.py:366] File "/app/venv/lib/python3.10/site-packages/vllm/model_executor/models/llama.py", line 407, in load_weights
ERROR 11-23 08:21:12 engine.py:366] param = params_dict[name]
ERROR 11-23 08:21:12 engine.py:366] KeyError: 'layers.74.mlp.down_proj.weight.absmax'
Process SpawnProcess-1:
Traceback (most recent call last):
File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
self.run()
File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/app/venv/lib/python3.10/site-packages/vllm/engine/multiprocessing/engine.py", line 368, in run_mp_engine
raise e
File "/app/venv/lib/python3.10/site-packages/vllm/engine/multiprocessing/engine.py", line 357, in run_mp_engine
engine = MQLLMEngine.from_engine_args(engine_args=engine_args,
File "/app/venv/lib/python3.10/site-packages/vllm/engine/multiprocessing/engine.py", line 119, in from_engine_args
return cls(ipc_path=ipc_path,
File "/app/venv/lib/python3.10/site-packages/vllm/engine/multiprocessing/engine.py", line 71, in __init__
self.engine = LLMEngine(*args, **kwargs)
File "/app/venv/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 347, in __init__
self.model_executor = executor_class(vllm_config=vllm_config, )
File "/app/venv/lib/python3.10/site-packages/vllm/executor/executor_base.py", line 36, in __init__
self._init_executor()
File "/app/venv/lib/python3.10/site-packages/vllm/executor/gpu_executor.py", line 40, in _init_executor
self.driver_worker.load_model()
File "/app/venv/lib/python3.10/site-packages/vllm/worker/worker.py", line 152, in load_model
self.model_runner.load_model()
File "/app/venv/lib/python3.10/site-packages/vllm/worker/model_runner.py", line 1074, in load_model
self.model = get_model(vllm_config=self.vllm_config)
File "/app/venv/lib/python3.10/site-packages/vllm/model_executor/model_loader/__init__.py", line 12, in get_model
return loader.load_model(vllm_config=vllm_config)
File "/app/venv/lib/python3.10/site-packages/vllm/model_executor/model_loader/loader.py", line 334, in load_model
model.load_weights(self._get_all_weights(model_config, model))
File "/app/venv/lib/python3.10/site-packages/vllm/model_executor/models/llama.py", line 586, in load_weights
loader.load_weights(
File "/app/venv/lib/python3.10/site-packages/vllm/model_executor/models/utils.py", line 229, in load_weights
autoloaded_weights = list(self._load_module("", self.module, weights))
File "/app/venv/lib/python3.10/site-packages/vllm/model_executor/models/utils.py", line 190, in _load_module
yield from self._load_module(prefix,
File "/app/venv/lib/python3.10/site-packages/vllm/model_executor/models/utils.py", line 175, in _load_module
module_load_weights(weights)
File "/app/venv/lib/python3.10/site-packages/vllm/model_executor/models/llama.py", line 407, in load_weights
param = params_dict[name]
KeyError: 'layers.74.mlp.down_proj.weight.absmax'
Loading safetensors checkpoint shards: 0% Completed | 0/8 [00:00<?, ?it/s]
[rank0]:[W1123 08:21:13.588827477 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
Traceback (most recent call last):
File "/app/venv/bin/vllm", line 8, in <module>
sys.exit(main())
File "/app/venv/lib/python3.10/site-packages/vllm/scripts.py", line 195, in main
args.dispatch_function(args)
File "/app/venv/lib/python3.10/site-packages/vllm/scripts.py", line 41, in serve
uvloop.run(run_server(args))
File "/app/venv/lib/python3.10/site-packages/uvloop/__init__.py", line 82, in run
return loop.run_until_complete(wrapper())
File "uvloop/loop.pyx", line 1517, in uvloop.loop.Loop.run_until_complete
File "/app/venv/lib/python3.10/site-packages/uvloop/__init__.py", line 61, in wrapper
return await main
File "/app/venv/lib/python3.10/site-packages/vllm/entrypoints/openai/api_server.py", line 609, in run_server
async with build_async_engine_client(args) as engine_client:
File "/usr/lib/python3.10/contextlib.py", line 199, in __aenter__
return await anext(self.gen)
File "/app/venv/lib/python3.10/site-packages/vllm/entrypoints/openai/api_server.py", line 113, in build_async_engine_client
async with build_async_engine_client_from_engine_args(
File "/usr/lib/python3.10/contextlib.py", line 199, in __aenter__
return await anext(self.gen)
File "/app/venv/lib/python3.10/site-packages/vllm/entrypoints/openai/api_server.py", line 210, in build_async_engine_client_from_engine_args
raise RuntimeError(
raise RuntimeError(
RuntimeError: Engine process failed to start. See stack trace for the root cause.
```
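One thing that stands out in the engine config above: quantization=bitsandbytes but load_format=LoadFormat.AUTO. For pre-quantized bitsandbytes checkpoints, vLLM of this vintage also expects the bitsandbytes load format, so the KeyError on 'layers.74.mlp.down_proj.weight.absmax' may simply mean the default loader doesn't know how to map the bnb quant-state tensors. A hedged sketch of a possible workaround (not confirmed against this exact model):

```python
# Possible workaround (assumption, not verified in this thread): force the
# bitsandbytes load format so vLLM expects the .absmax / quant-state tensors
# stored alongside the pre-quantized 4-bit weights.
from vllm import LLM

llm = LLM(
    model="unsloth/Llama-3.1-Nemotron-70B-Instruct-bnb-4bit",
    quantization="bitsandbytes",
    load_format="bitsandbytes",
    max_model_len=16384,
)
```

The CLI equivalent would be passing --quantization bitsandbytes --load-format bitsandbytes to vllm serve. Note also that the first traceback goes through multiproc_gpu_executor (i.e. more than one GPU worker); bitsandbytes quantization had little or no tensor-parallel support in vLLM around v0.6.4, which could explain the shape-mismatch assertion seen there.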