riczhou committed
Commit ff21671
Parent: 49d22b7

Initial commit

logs.txt ADDED
@@ -0,0 +1,105 @@
+ /opt/conda/envs/py310/bin/python -m mlc_llm gen_config /models/Mixtral-8x7B-Instruct-v0.1 --quantization q4f32_1 --conv-template mistral_default --output /models/mlc-delivery/hf/mlc-ai/Mixtral-8x7B-Instruct-v0.1-q4f32_1-MLC
+ [2024-06-06 22:21:44] INFO auto_config.py:116: Found model configuration: /models/Mixtral-8x7B-Instruct-v0.1/config.json
+ [2024-06-06 22:21:44] INFO auto_config.py:154: Found model type: mixtral. Use `--model-type` to override.
+ [2024-06-06 22:21:44] INFO llama_model.py:52: context_window_size not found in config.json. Falling back to max_position_embeddings (32768)
+ [2024-06-06 22:21:44] INFO llama_model.py:72: prefill_chunk_size defaults to 2048
+ [2024-06-06 22:21:44] INFO config.py:107: Overriding max_batch_size from 1 to 80
+ [2024-06-06 22:21:44] INFO gen_config.py:143: [generation_config.json] Setting bos_token_id: 1
+ [2024-06-06 22:21:44] INFO gen_config.py:143: [generation_config.json] Setting eos_token_id: 2
+ [2024-06-06 22:21:44] INFO gen_config.py:155: Found tokenizer config: /models/Mixtral-8x7B-Instruct-v0.1/tokenizer.model. Copying to /models/mlc-delivery/hf/mlc-ai/Mixtral-8x7B-Instruct-v0.1-q4f32_1-MLC/tokenizer.model
+ [2024-06-06 22:21:44] INFO gen_config.py:155: Found tokenizer config: /models/Mixtral-8x7B-Instruct-v0.1/tokenizer.json. Copying to /models/mlc-delivery/hf/mlc-ai/Mixtral-8x7B-Instruct-v0.1-q4f32_1-MLC/tokenizer.json
+ [2024-06-06 22:21:44] INFO gen_config.py:157: Not found tokenizer config: /models/Mixtral-8x7B-Instruct-v0.1/vocab.json
+ [2024-06-06 22:21:44] INFO gen_config.py:157: Not found tokenizer config: /models/Mixtral-8x7B-Instruct-v0.1/merges.txt
+ [2024-06-06 22:21:44] INFO gen_config.py:157: Not found tokenizer config: /models/Mixtral-8x7B-Instruct-v0.1/added_tokens.json
+ [2024-06-06 22:21:44] INFO gen_config.py:155: Found tokenizer config: /models/Mixtral-8x7B-Instruct-v0.1/tokenizer_config.json. Copying to /models/mlc-delivery/hf/mlc-ai/Mixtral-8x7B-Instruct-v0.1-q4f32_1-MLC/tokenizer_config.json
+ [2024-06-06 22:21:44] INFO gen_config.py:216: Detected tokenizer info: {'token_postproc_method': 'byte_fallback', 'prepend_space_in_encode': True, 'strip_space_in_decode': True}
+ [2024-06-06 22:21:44] INFO gen_config.py:32: [System default] Setting pad_token_id: 0
+ [2024-06-06 22:21:44] INFO gen_config.py:32: [System default] Setting temperature: 1.0
+ [2024-06-06 22:21:44] INFO gen_config.py:32: [System default] Setting presence_penalty: 0.0
+ [2024-06-06 22:21:44] INFO gen_config.py:32: [System default] Setting frequency_penalty: 0.0
+ [2024-06-06 22:21:44] INFO gen_config.py:32: [System default] Setting repetition_penalty: 1.0
+ [2024-06-06 22:21:44] INFO gen_config.py:32: [System default] Setting top_p: 1.0
+ [2024-06-06 22:21:44] INFO gen_config.py:223: Dumping configuration file to: /models/mlc-delivery/hf/mlc-ai/Mixtral-8x7B-Instruct-v0.1-q4f32_1-MLC/mlc-chat-config.json
+ /opt/conda/envs/py310/bin/python -m mlc_llm convert_weight /models/Mixtral-8x7B-Instruct-v0.1 --quantization q4f32_1 --output /models/mlc-delivery/hf/mlc-ai/Mixtral-8x7B-Instruct-v0.1-q4f32_1-MLC
+ [2024-06-06 22:21:46] INFO auto_config.py:116: Found model configuration: /models/Mixtral-8x7B-Instruct-v0.1/config.json
+ [2024-06-06 22:21:47] INFO auto_device.py:79: Found device: cuda:0
+ [2024-06-06 22:21:49] INFO auto_device.py:88: Not found device: rocm:0
+ [2024-06-06 22:21:50] INFO auto_device.py:88: Not found device: metal:0
+ [2024-06-06 22:21:52] INFO auto_device.py:79: Found device: vulkan:0
+ [2024-06-06 22:21:52] INFO auto_device.py:79: Found device: vulkan:1
+ [2024-06-06 22:21:52] INFO auto_device.py:79: Found device: vulkan:2
+ [2024-06-06 22:21:52] INFO auto_device.py:79: Found device: vulkan:3
+ [2024-06-06 22:21:53] INFO auto_device.py:88: Not found device: opencl:0
+ [2024-06-06 22:21:53] INFO auto_device.py:35: Using device: cuda:0
+ [2024-06-06 22:21:53] INFO auto_weight.py:71: Finding weights in: /models/Mixtral-8x7B-Instruct-v0.1
+ [2024-06-06 22:21:53] INFO auto_weight.py:137: Not found Huggingface PyTorch
+ [2024-06-06 22:21:53] INFO auto_weight.py:144: Found source weight format: huggingface-safetensor. Source configuration: /models/Mixtral-8x7B-Instruct-v0.1/model.safetensors.index.json
+ [2024-06-06 22:21:53] INFO auto_weight.py:107: Using source weight configuration: /models/Mixtral-8x7B-Instruct-v0.1/model.safetensors.index.json. Use `--source` to override.
+ [2024-06-06 22:21:53] INFO auto_weight.py:111: Using source weight format: huggingface-safetensor. Use `--source-format` to override.
+ [2024-06-06 22:21:53] INFO auto_config.py:154: Found model type: mixtral. Use `--model-type` to override.
+ [2024-06-06 22:21:53] INFO llama_model.py:52: context_window_size not found in config.json. Falling back to max_position_embeddings (32768)
+ [2024-06-06 22:21:53] INFO llama_model.py:72: prefill_chunk_size defaults to 2048
+ Weight conversion with arguments:
+   --config /models/Mixtral-8x7B-Instruct-v0.1/config.json
+   --quantization GroupQuantize(name='q4f32_1', kind='group-quant', group_size=32, quantize_dtype='int4', storage_dtype='uint32', model_dtype='float32', linear_weight_layout='NK', quantize_embedding=True, quantize_final_fc=True, num_elem_per_storage=8, num_storage_per_group=4, max_int_value=7)
+   --model-type mixtral
+   --device cuda:0
+   --source /models/Mixtral-8x7B-Instruct-v0.1/model.safetensors.index.json
+   --source-format huggingface-safetensor
+   --output /models/mlc-delivery/hf/mlc-ai/Mixtral-8x7B-Instruct-v0.1-q4f32_1-MLC
+ Start storing to cache /models/mlc-delivery/hf/mlc-ai/Mixtral-8x7B-Instruct-v0.1-q4f32_1-MLC
+   0%| | 0/227 [00:00<?, ?it/s]
+   0%| | 0/227 [00:00<?, ?it/s]
+   0%| | 0/227 [00:05<?, ?it/s]
+   0%| | 0/227 [00:06<?, ?it/s]
+   0%| | 0/227 [00:06<?, ?it/s]
+   0%| | 1/227 [00:06<23:41, 6.29s/it]
+   0%| | 1/227 [00:06<23:41, 6.29s/it]
+   0%| | 1/227 [00:18<23:41, 6.29s/it]
+   0%| | 1/227 [00:18<23:41, 6.29s/it]
+   0%| | 1/227 [00:19<23:41, 6.29s/it]
+   1%| | 2/227 [00:20<40:15, 10.74s/it]
+   1%| | 2/227 [00:20<40:15, 10.74s/it]
+   1%| | 2/227 [00:21<40:15, 10.74s/it]
+   1%| | 2/227 [00:21<40:15, 10.74s/it]
+   1%|▏ | 3/227 [00:22<24:57, 6.68s/it]
+   1%|▏ | 3/227 [00:22<24:57, 6.68s/it]
+   1%|▏ | 3/227 [00:22<24:57, 6.68s/it]
+   2%|▏ | 5/227 [00:29<21:48, 5.89s/it]
+ Traceback (most recent call last):
+   File "/opt/conda/envs/py310/lib/python3.10/runpy.py", line 196, in _run_module_as_main
+     return _run_code(code, main_globals, None,
+   File "/opt/conda/envs/py310/lib/python3.10/runpy.py", line 86, in _run_code
+     exec(code, run_globals)
+   File "/opt/conda/envs/py310/lib/python3.10/site-packages/mlc_llm/__main__.py", line 64, in <module>
+     main()
+   File "/opt/conda/envs/py310/lib/python3.10/site-packages/mlc_llm/__main__.py", line 37, in main
+     cli.main(sys.argv[2:])
+   File "/opt/conda/envs/py310/lib/python3.10/site-packages/mlc_llm/cli/convert_weight.py", line 88, in main
+     convert_weight(
+   File "/opt/conda/envs/py310/lib/python3.10/site-packages/mlc_llm/interface/convert_weight.py", line 181, in convert_weight
+     _convert_args(args)
+   File "/opt/conda/envs/py310/lib/python3.10/site-packages/mlc_llm/interface/convert_weight.py", line 145, in _convert_args
+     tvmjs.dump_ndarray_cache(
+   File "/opt/conda/envs/py310/lib/python3.10/site-packages/tvm/contrib/tvmjs.py", line 272, in dump_ndarray_cache
+     for k, origin_v in param_generator:
+   File "/opt/conda/envs/py310/lib/python3.10/site-packages/mlc_llm/interface/convert_weight.py", line 129, in _param_generator
+     for name, param in loader.load(device=args.device, preshard_funcs=preshard_funcs):
+   File "/opt/conda/envs/py310/lib/python3.10/site-packages/mlc_llm/loader/huggingface_loader.py", line 118, in load
+     param = self._load_mlc_param(mlc_name, device=device)
+   File "/opt/conda/envs/py310/lib/python3.10/site-packages/mlc_llm/loader/huggingface_loader.py", line 157, in _load_mlc_param
+     return as_ndarray(param, device=device)
+   File "/opt/conda/envs/py310/lib/python3.10/site-packages/tvm/runtime/ndarray.py", line 675, in array
+     return empty(arr.shape, arr.dtype, device, mem_scope).copyfrom(arr)
+   File "/opt/conda/envs/py310/lib/python3.10/site-packages/tvm/runtime/ndarray.py", line 431, in empty
+     arr = _ffi_api.TVMArrayAllocWithScope(shape, dtype, device, mem_scope)
+   File "tvm/_ffi/_cython/./packed_func.pxi", line 332, in tvm._ffi._cy3.core.PackedFuncBase.__call__
+   File "tvm/_ffi/_cython/./packed_func.pxi", line 277, in tvm._ffi._cy3.core.FuncCall
+   File "tvm/_ffi/_cython/./base.pxi", line 182, in tvm._ffi._cy3.core.CHECK_CALL
+   File "/opt/conda/envs/py310/lib/python3.10/site-packages/tvm/_ffi/base.py", line 481, in raise_last_ffi_error
+     raise py_err
+ tvm.error.InternalError: Traceback (most recent call last):
+   5: _ZN3tvm7runtime13PackedFun
+   4: tvm::runtime::TypedPackedFunc<tvm::runtime::NDArray (tvm::runtime::ShapeTuple, DLDataType, DLDevice, tvm::runtime::Optional<tvm::runtime::String>)>::AssignTypedLambda<tvm::runtime::NDArray (*)(tvm::runtime::ShapeTuple, DLDataType, DLDevice, tvm::runtime::Optional<tvm::runtime::String>)>(tvm::runtime::NDArray (*)(tvm::runtime::ShapeTuple, DLDataType, DLDevice, tvm::runtime::Optional<tvm::runtime::String>), std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >)::{lambda(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*)#1}::operator()(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*) const
+   3: tvm::runtime::NDArray::Empty(tvm::runtime::ShapeTuple, DLDataType, DLDevice, tvm::runtime::Optional<tvm::runtime::String>)
+   2: tvm::runtime::DeviceAPI::AllocDataSpace(DLDevice, int, long const*, DLDataType, tvm::runtime::Optional<tvm::runtime::String>)
+   1: tvm::runtime::CUDADeviceAPI::AllocDataSpace(DLDevice, unsigned long, unsigned long, DLDataType)
+   0: _ZN3tvm7runtime6deta
+   File "/workspace/tvm/src/runtime/cuda/cuda_device_api.cc", line 145
+ InternalError: Check failed: (e == cudaSuccess || e == cudaErrorCudartUnloading) is false: CUDA: out of memory
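
Note: the log above ends with the convert_weight step failing on a CUDA out-of-memory error while a parameter is being materialized on cuda:0. As a minimal sketch (it assumes PyTorch is importable in the same conda environment; the code is illustrative and not taken from this log), one way to check how much device memory is actually free before re-running the conversion:

```python
import torch

def report_free_vram(device_index: int = 0) -> None:
    """Print free vs. total memory for one CUDA device, in GiB."""
    free_bytes, total_bytes = torch.cuda.mem_get_info(device_index)
    gib = 1024 ** 3
    print(f"cuda:{device_index}: {free_bytes / gib:.1f} GiB free "
          f"of {total_bytes / gib:.1f} GiB total")

if __name__ == "__main__":
    if torch.cuda.is_available():
        for idx in range(torch.cuda.device_count()):
            report_free_vram(idx)
    else:
        print("No CUDA device visible to PyTorch.")
```
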
mlc-chat-config.json ADDED
@@ -0,0 +1,79 @@
+ {
+   "version": "0.1.0",
+   "model_type": "mixtral",
+   "quantization": "q4f32_1",
+   "model_config": {
+     "hidden_size": 4096,
+     "intermediate_size": 14336,
+     "num_attention_heads": 32,
+     "num_hidden_layers": 32,
+     "rms_norm_eps": 1e-05,
+     "vocab_size": 32000,
+     "position_embedding_base": 1000000.0,
+     "context_window_size": 32768,
+     "prefill_chunk_size": 2048,
+     "num_key_value_heads": 8,
+     "head_dim": 128,
+     "tensor_parallel_shards": 1,
+     "max_batch_size": 80,
+     "num_local_experts": 8,
+     "num_experts_per_tok": 2
+   },
+   "vocab_size": 32000,
+   "context_window_size": 32768,
+   "sliding_window_size": -1,
+   "prefill_chunk_size": 2048,
+   "attention_sink_size": -1,
+   "tensor_parallel_shards": 1,
+   "temperature": 1.0,
+   "presence_penalty": 0.0,
+   "frequency_penalty": 0.0,
+   "repetition_penalty": 1.0,
+   "top_p": 1.0,
+   "tokenizer_files": [
+     "tokenizer.model",
+     "tokenizer.json",
+     "tokenizer_config.json"
+   ],
+   "tokenizer_info": {
+     "token_postproc_method": "byte_fallback",
+     "prepend_space_in_encode": true,
+     "strip_space_in_decode": true
+   },
+   "conv_template": {
+     "name": "mistral_default",
+     "system_template": "[INST] {system_message}",
+     "system_message": "Always assist with care, respect, and truth. Respond with utmost utility yet securely. Avoid harmful, unethical, prejudiced, or negative content. Ensure replies promote fairness and positivity.",
+     "system_prefix_token_ids": [
+       1
+     ],
+     "add_role_after_system_message": false,
+     "roles": {
+       "user": "[INST]",
+       "assistant": "[/INST]",
+       "tool": "[INST]"
+     },
+     "role_templates": {
+       "user": "{user_message}",
+       "assistant": "{assistant_message}",
+       "tool": "{tool_message}"
+     },
+     "messages": [],
+     "seps": [
+       " "
+     ],
+     "role_content_sep": " ",
+     "role_empty_sep": "",
+     "stop_str": [
+       "</s>"
+     ],
+     "stop_token_ids": [
+       2
+     ],
+     "function_string": "",
+     "use_function_calling": false
+   },
+   "pad_token_id": 0,
+   "bos_token_id": 1,
+   "eos_token_id": 2
+ }
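
Note: for reference, a minimal sketch of reading the generated mlc-chat-config.json from Python and inspecting a few of the fields shown above. The local path is illustrative, not something this commit ships.

```python
import json
from pathlib import Path

# Illustrative path; point this at wherever the MLC output directory was placed.
config_path = Path("Mixtral-8x7B-Instruct-v0.1-q4f32_1-MLC/mlc-chat-config.json")

with config_path.open(encoding="utf-8") as f:
    cfg = json.load(f)

print(cfg["model_type"], cfg["quantization"])                  # mixtral q4f32_1
print(cfg["context_window_size"], cfg["prefill_chunk_size"])   # 32768 2048
print(cfg["conv_template"]["name"])                            # mistral_default
print(len(cfg["tokenizer_files"]))                             # 3
```
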
params_shard_0.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2139fc39441ae69ecf90ef1a4407a1e53bdea162ed6134483a07a131af0a290c
+ size 65536000
params_shard_1.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3dc50ed18a9ea66cf35d2c494df597a1bd33bd26429b4ea941bb6cc0e29e0c78
+ size 469762048
params_shard_2.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:00b376256f2cca2233ff824f675d7e4468f41f570bc63cb0b4c02ebe19b5eaed
+ size 58720256
params_shard_3.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1f7e75d17b23f2e874301f25e6285650fed5503e5a1d5adf755f59e98a70e777
+ size 234881024
params_shard_4.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:261604710e332abd50bb8434b9beda23309083d78840e2732ac17a18bdcc35a5
+ size 29360128
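
Note: each params_shard_*.bin entry above is a Git LFS pointer, a three-line text stub holding the spec version, the sha256 oid, and the payload size in bytes. A minimal sketch for parsing such a pointer (assuming the file on disk is still the text stub rather than the fetched binary):

```python
from pathlib import Path

def read_lfs_pointer(path: Path) -> dict[str, str]:
    """Split a Git LFS pointer file into its 'version', 'oid', and 'size' fields."""
    fields = {}
    for line in path.read_text(encoding="utf-8").splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

pointer = read_lfs_pointer(Path("params_shard_0.bin"))
print(pointer["oid"])        # e.g. sha256:2139fc39...
print(int(pointer["size"]))  # 65536000 (bytes)
```
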
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
+ size 493443
tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
+ {
+   "add_bos_token": true,
+   "add_eos_token": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [],
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "</s>",
+   "legacy": true,
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": null,
+   "sp_model_kwargs": {},
+   "spaces_between_special_tokens": false,
+   "tokenizer_class": "LlamaTokenizer",
+   "unk_token": "<unk>",
+   "use_default_system_prompt": false,
+   "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
+ }
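
Note: the chat_template above is the standard Mistral-instruct Jinja template: user turns are wrapped in [INST] ... [/INST] and each assistant turn is closed with the EOS token. A minimal sketch of rendering it, assuming the Hugging Face transformers library is installed and the tokenizer files from this commit are available locally or under the repo id that appears in the logs:

```python
from transformers import AutoTokenizer

# Repo id taken from the output path in logs.txt; a local directory holding the
# tokenizer files from this commit would work the same way.
tok = AutoTokenizer.from_pretrained("mlc-ai/Mixtral-8x7B-Instruct-v0.1-q4f32_1-MLC")

messages = [
    {"role": "user", "content": "Hello!"},
    {"role": "assistant", "content": "Hi there."},
    {"role": "user", "content": "What does q4f32_1 mean?"},
]

# Per the template above, this renders roughly:
# <s>[INST] Hello! [/INST]Hi there.</s>[INST] What does q4f32_1 mean? [/INST]
print(tok.apply_chat_template(messages, tokenize=False))
```
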