oldfritter commited on
Commit
3628a0b
·
1 Parent(s): 6898053

Upload 4 files

Browse files
Files changed (4) hide show
  1. config.json +51 -0
  2. configuration_qwen.py +71 -0
  3. cpp_kernels.py +55 -0
  4. generation_config.json +12 -0
config.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/home/tg/.cache/huggingface/hub/models--Qwen--Qwen-72B-Chat/snapshots/49c7a076779b3396ad12a452babe0fe8da07f297",
3
+ "architectures": [
4
+ "QWenLMHeadModel"
5
+ ],
6
+ "attn_dropout_prob": 0.0,
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_qwen.QWenConfig",
9
+ "AutoModelForCausalLM": "modeling_qwen.QWenLMHeadModel"
10
+ },
11
+ "bf16": true,
12
+ "emb_dropout_prob": 0.0,
13
+ "fp16": false,
14
+ "fp32": false,
15
+ "hidden_size": 8192,
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 49152,
18
+ "kv_channels": 128,
19
+ "layer_norm_epsilon": 1e-06,
20
+ "max_position_embeddings": 32768,
21
+ "model_type": "qwen",
22
+ "no_bias": true,
23
+ "num_attention_heads": 64,
24
+ "num_hidden_layers": 80,
25
+ "onnx_safe": null,
26
+ "quantization_config": {
27
+ "bits": 4,
28
+ "group_size": 128,
29
+ "modules_to_not_convert": null,
30
+ "quant_method": "awq",
31
+ "version": "gemm",
32
+ "zero_point": true
33
+ },
34
+ "rope_theta": 1000000,
35
+ "rotary_emb_base": 1000000,
36
+ "rotary_pct": 1.0,
37
+ "scale_attn_weights": true,
38
+ "seq_length": 32768,
39
+ "softmax_in_fp32": false,
40
+ "tie_word_embeddings": false,
41
+ "tokenizer_class": "QWenTokenizer",
42
+ "torch_dtype": "bfloat16",
43
+ "transformers_version": "4.36.0",
44
+ "use_cache": true,
45
+ "use_cache_kernel": false,
46
+ "use_cache_quantization": false,
47
+ "use_dynamic_ntk": false,
48
+ "use_flash_attn": true,
49
+ "use_logn_attn": false,
50
+ "vocab_size": 152064
51
+ }
configuration_qwen.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Alibaba Cloud.
2
+ #
3
+ # This source code is licensed under the license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from transformers import PretrainedConfig
7
+
8
+
9
+ class QWenConfig(PretrainedConfig):
10
+ model_type = "qwen"
11
+ keys_to_ignore_at_inference = ["past_key_values"]
12
+
13
+ def __init__(
14
+ self,
15
+ vocab_size=151936,
16
+ hidden_size=4096,
17
+ num_hidden_layers=32,
18
+ num_attention_heads=32,
19
+ emb_dropout_prob=0.0,
20
+ attn_dropout_prob=0.0,
21
+ layer_norm_epsilon=1e-6,
22
+ initializer_range=0.02,
23
+ max_position_embeddings=8192,
24
+ scale_attn_weights=True,
25
+ use_cache=True,
26
+ bf16=False,
27
+ fp16=False,
28
+ fp32=False,
29
+ kv_channels=128,
30
+ rotary_pct=1.0,
31
+ rotary_emb_base=10000,
32
+ use_dynamic_ntk=True,
33
+ use_logn_attn=True,
34
+ use_flash_attn="auto",
35
+ intermediate_size=22016,
36
+ no_bias=True,
37
+ tie_word_embeddings=False,
38
+ use_cache_quantization=False,
39
+ use_cache_kernel=False,
40
+ softmax_in_fp32=False,
41
+ **kwargs,
42
+ ):
43
+ self.vocab_size = vocab_size
44
+ self.hidden_size = hidden_size
45
+ self.intermediate_size = intermediate_size
46
+ self.num_hidden_layers = num_hidden_layers
47
+ self.num_attention_heads = num_attention_heads
48
+ self.emb_dropout_prob = emb_dropout_prob
49
+ self.attn_dropout_prob = attn_dropout_prob
50
+ self.layer_norm_epsilon = layer_norm_epsilon
51
+ self.initializer_range = initializer_range
52
+ self.scale_attn_weights = scale_attn_weights
53
+ self.use_cache = use_cache
54
+ self.max_position_embeddings = max_position_embeddings
55
+ self.bf16 = bf16
56
+ self.fp16 = fp16
57
+ self.fp32 = fp32
58
+ self.kv_channels = kv_channels
59
+ self.rotary_pct = rotary_pct
60
+ self.rotary_emb_base = rotary_emb_base
61
+ self.use_dynamic_ntk = use_dynamic_ntk
62
+ self.use_logn_attn = use_logn_attn
63
+ self.use_flash_attn = use_flash_attn
64
+ self.no_bias = no_bias
65
+ self.use_cache_quantization = use_cache_quantization
66
+ self.use_cache_kernel = use_cache_kernel
67
+ self.softmax_in_fp32 = softmax_in_fp32
68
+ super().__init__(
69
+ tie_word_embeddings=tie_word_embeddings,
70
+ **kwargs
71
+ )
cpp_kernels.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from torch.utils import cpp_extension
2
+ import pathlib
3
+ import os
4
+ import subprocess
5
+
6
+ def _get_cuda_bare_metal_version(cuda_dir):
7
+ raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"],
8
+ universal_newlines=True)
9
+ output = raw_output.split()
10
+ release_idx = output.index("release") + 1
11
+ release = output[release_idx].split(".")
12
+ bare_metal_major = release[0]
13
+ bare_metal_minor = release[1][0]
14
+
15
+ return raw_output, bare_metal_major, bare_metal_minor
16
+
17
+ def _create_build_dir(buildpath):
18
+ try:
19
+ os.mkdir(buildpath)
20
+ except OSError:
21
+ if not os.path.isdir(buildpath):
22
+ print(f"Creation of the build directory {buildpath} failed")
23
+
24
+ # Check if cuda 11 is installed for compute capability 8.0
25
+ cc_flag = []
26
+ _, bare_metal_major, bare_metal_minor = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME)
27
+ if int(bare_metal_major) >= 11:
28
+ cc_flag.append('-gencode')
29
+ cc_flag.append('arch=compute_80,code=sm_80')
30
+ if int(bare_metal_minor) >= 7:
31
+ cc_flag.append('-gencode')
32
+ cc_flag.append('arch=compute_90,code=sm_90')
33
+
34
+ # Build path
35
+ srcpath = pathlib.Path(__file__).parent.absolute()
36
+ buildpath = srcpath / 'build'
37
+ _create_build_dir(buildpath)
38
+
39
+ def _cpp_extention_load_helper(name, sources, extra_cuda_flags):
40
+ return cpp_extension.load(
41
+ name=name,
42
+ sources=sources,
43
+ build_directory=buildpath,
44
+ extra_cflags=['-O3', ],
45
+ extra_cuda_cflags=['-O3',
46
+ '-gencode', 'arch=compute_70,code=sm_70',
47
+ '--use_fast_math'] + extra_cuda_flags + cc_flag,
48
+ verbose=1
49
+ )
50
+
51
+ extra_flags = []
52
+
53
+ cache_autogptq_cuda_256_sources = ["./cache_autogptq_cuda_256.cpp",
54
+ "./cache_autogptq_cuda_kernel_256.cu"]
55
+ cache_autogptq_cuda_256 = _cpp_extention_load_helper("cache_autogptq_cuda_256", cache_autogptq_cuda_256_sources, extra_flags)
generation_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "chat_format": "chatml",
3
+ "do_sample": true,
4
+ "eos_token_id": 151643,
5
+ "max_new_tokens": 512,
6
+ "max_window_size": 6144,
7
+ "pad_token_id": 151643,
8
+ "repetition_penalty": 1.1,
9
+ "top_k": 0,
10
+ "top_p": 0.8,
11
+ "transformers_version": "4.36.0"
12
+ }