jslin09 committed
Commit 7bc4f04 · verified · 1 Parent(s): 718eb00

Upload 4 files

Files changed (4)
  1. config.json +23 -0
  2. modeling_kuixing.py +155 -0
  3. tokenizer.model +3 -0
  4. tokenizer_config.json +9 -0
config.json ADDED
@@ -0,0 +1,23 @@
+ {
+   "model_type": "kuixing",
+   "architectures": [
+     "KuiXingForCausalLM"
+   ],
+   "auto_map": {
+     "AutoConfig": "modeling_kuixing.KuiXingHFConfig",
+     "AutoModelForCausalLM": "modeling_kuixing.KuiXingForCausalLM"
+   },
+   "vocab_size": 99384,
+   "hidden_size": 2400,
+   "num_hidden_layers": 12,
+   "num_attention_heads": 32,
+   "intermediate_size": 9600,
+   "max_position_embeddings": 2048,
+   "hidden_act": "gelu",
+   "dropout": 0.1,
+   "pad_token_id": 0,
+   "bos_token_id": 2,
+   "eos_token_id": 3,
+   "torch_dtype": "float32",
+   "transformers_version": "4.40.0"
+ }
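
Usage note: because config.json registers the classes through auto_map, this checkpoint is meant to be loaded with trust_remote_code=True. A minimal loading sketch, assuming the repository has been downloaded to a local directory (the custom from_pretrained in modeling_kuixing.py reads config.json and model.safetensors from disk; model.safetensors is not part of this 4-file commit):

    from transformers import AutoModelForCausalLM

    # "path/to/kuixing" is an illustrative local path, not a real repo id.
    model = AutoModelForCausalLM.from_pretrained(
        "path/to/kuixing",
        trust_remote_code=True,  # lets auto_map resolve modeling_kuixing.py
    )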
modeling_kuixing.py ADDED
@@ -0,0 +1,155 @@
+ """
+ KuiXing (魁星) — HuggingFace-compatible wrapper layer
+ AutoConfig → KuiXingHFConfig
+ AutoModelForCausalLM → KuiXingForCausalLM
+
+ Weights are stored as float32 in model.safetensors.
+ For bfloat16 inference: model = model.to(torch.bfloat16).eval()
+ """
+ import math, os
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from safetensors.torch import load_file
+ from transformers import PretrainedConfig, PreTrainedModel, GenerationMixin
+ from transformers.modeling_outputs import CausalLMOutputWithPast
+
+
+ class KuiXingHFConfig(PretrainedConfig):
+     model_type = "kuixing"
+     def __init__(
+         self,
+         vocab_size=99384,
+         hidden_size=2400,
+         num_hidden_layers=12,
+         num_attention_heads=32,
+         intermediate_size=9600,
+         max_position_embeddings=2048,
+         dropout=0.1,
+         pad_token_id=0, bos_token_id=2, eos_token_id=3,
+         **kwargs,
+     ):
+         super().__init__(pad_token_id=pad_token_id,
+                          bos_token_id=bos_token_id,
+                          eos_token_id=eos_token_id, **kwargs)
+         self.vocab_size = vocab_size
+         self.hidden_size = hidden_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.intermediate_size = intermediate_size
+         self.max_position_embeddings = max_position_embeddings
+         self.dropout = dropout
+
+
+ class _Attention(nn.Module):
+     def __init__(self, cfg):
+         super().__init__()
+         self.n_heads = cfg.num_attention_heads
+         self.d_head = cfg.hidden_size // cfg.num_attention_heads
+         self.q_proj = nn.Linear(cfg.hidden_size, cfg.hidden_size, bias=False)
+         self.k_proj = nn.Linear(cfg.hidden_size, cfg.hidden_size, bias=False)
+         self.v_proj = nn.Linear(cfg.hidden_size, cfg.hidden_size, bias=False)
+         self.o_proj = nn.Linear(cfg.hidden_size, cfg.hidden_size, bias=False)
+
+     def forward(self, x, mask=None):
+         B, L, D = x.shape
+         H, Dh = self.n_heads, self.d_head
+         q = self.q_proj(x).view(B, L, H, Dh).transpose(1, 2)
+         k = self.k_proj(x).view(B, L, H, Dh).transpose(1, 2)
+         v = self.v_proj(x).view(B, L, H, Dh).transpose(1, 2)
+         w = (q.float() @ k.float().transpose(-2, -1)) / math.sqrt(Dh)
+         if mask is not None:
+             w = w + mask
+         w = F.softmax(w, dim=-1).to(x.dtype)
+         out = (w.float() @ v.float()).to(x.dtype)
+         return self.o_proj(out.transpose(1, 2).reshape(B, L, D))
+
+
+ class _MLP(nn.Module):
+     def __init__(self, cfg):
+         super().__init__()
+         self.fc1 = nn.Linear(cfg.hidden_size, cfg.intermediate_size, bias=False)
+         self.act = nn.GELU()
+         self.fc2 = nn.Linear(cfg.intermediate_size, cfg.hidden_size, bias=False)
+     def forward(self, x):
+         return self.fc2(self.act(self.fc1(x)))
+
+
+ class _Block(nn.Module):
+     def __init__(self, cfg):
+         super().__init__()
+         self.norm1 = nn.RMSNorm(cfg.hidden_size)
+         self.attention = _Attention(cfg)
+         self.norm2 = nn.RMSNorm(cfg.hidden_size)
+         self.mlp = _MLP(cfg)
+     def forward(self, x, mask=None):
+         x = x + self.attention(self.norm1(x), mask)
+         x = x + self.mlp(self.norm2(x))
+         return x
+
+
+ class _KuiXingCore(nn.Module):
+     def __init__(self, cfg):
+         super().__init__()
+         self.token_emb = nn.Embedding(cfg.vocab_size, cfg.hidden_size)
+         self.pos_emb = nn.Embedding(cfg.max_position_embeddings, cfg.hidden_size)
+         self.layers = nn.ModuleList([_Block(cfg) for _ in range(cfg.num_hidden_layers)])
+         self.norm_final = nn.RMSNorm(cfg.hidden_size)
+         self.lm_head = nn.Linear(cfg.hidden_size, cfg.vocab_size, bias=False)
+         self.lm_head.weight = self.token_emb.weight
+
+     def forward(self, input_ids):
+         B, L = input_ids.shape
+         pos = torch.arange(L, device=input_ids.device)
+         h = self.token_emb(input_ids) + self.pos_emb(pos)
+         mask = torch.triu(
+             torch.full((L, L), float("-inf"), device=input_ids.device), diagonal=1
+         ).unsqueeze(0).unsqueeze(0)
+         for layer in self.layers:
+             h = layer(h, mask)
+         return self.lm_head(self.norm_final(h))
+
+
+ class KuiXingForCausalLM(PreTrainedModel, GenerationMixin):
+     config_class = KuiXingHFConfig
+     supports_gradient_checkpointing = False
+
+     def __init__(self, config):
+         super().__init__(config)
+         self.model = _KuiXingCore(config)
+         self.post_init()
+
+     @classmethod
+     def from_pretrained(cls, model_path, **kwargs):
+         import json
+         with open(os.path.join(model_path, "config.json")) as f:
+             cfg_dict = json.load(f)
+         valid = set(KuiXingHFConfig.__init__.__code__.co_varnames)
+         hf_cfg = KuiXingHFConfig(**{k: v for k, v in cfg_dict.items() if k in valid})
+         model = cls(hf_cfg)
+         sd = load_file(os.path.join(model_path, "model.safetensors"))
+         missing, unexpected = model.load_state_dict(sd, strict=False)
+         # lm_head.weight is not stored in safetensors (weight tying); rebuild the shared reference after loading
+         model.model.lm_head.weight = model.model.token_emb.weight
+         # lm_head.weight is omitted on purpose (weight tying); drop it from missing before checking
+         missing = [k for k in missing if k != "model.lm_head.weight"]
+         if not missing and not unexpected:
+             print("✅ All weight keys mapped with no gaps.\nFor bfloat16 inference: model = model.to(torch.bfloat16).eval()")
+         else:
+             if missing: print(f"⚠️ Missing keys ({len(missing)}): {missing[:5]}")
+             if unexpected: print(f"⚠️ Unexpected keys ({len(unexpected)}): {unexpected[:5]}")
+         return model.eval()
+
+     def forward(self, input_ids=None, labels=None, **kwargs):
+         logits = self.model(input_ids)
+         loss = None
+         if labels is not None:
+             loss = F.cross_entropy(
+                 logits[:, :-1].reshape(-1, logits.size(-1)),
+                 labels[:, 1:].reshape(-1),
+                 ignore_index=-100,
+             )
+         return CausalLMOutputWithPast(loss=loss, logits=logits)
+
+     def prepare_inputs_for_generation(self, input_ids, **kwargs):
+         return {"input_ids": input_ids}
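
Usage note: a minimal generation sketch against the wrapper above. It assumes model.safetensors sits next to these files (it is not in this commit) and that AutoTokenizer can build the fast tokenizer declared in tokenizer_config.json; the local path and prompt are illustrative:

    import torch
    from transformers import AutoTokenizer
    from modeling_kuixing import KuiXingForCausalLM

    path = "path/to/kuixing"  # illustrative local checkout of this repo
    model = KuiXingForCausalLM.from_pretrained(path)
    tokenizer = AutoTokenizer.from_pretrained(path)

    model = model.to(torch.bfloat16).eval()  # optional, per the module docstring

    inputs = tokenizer("你好", return_tensors="pt")
    with torch.no_grad():
        out = model.generate(inputs["input_ids"], max_new_tokens=50)
    print(tokenizer.decode(out[0], skip_special_tokens=True))

Note that prepare_inputs_for_generation feeds the full sequence back each step (there is no KV cache), so generation cost grows quadratically with output length; that matches the simple forward pass above.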
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ea55192fcb4081708ba5f50ea10a73974665eb4ed33f6ede0705ce3dc85b682d
+ size 1484540
tokenizer_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "tokenizer_class": "PreTrainedTokenizerFast",
+   "model_max_length": 2048,
+   "padding_side": "right",
+   "bos_token": "<s>",
+   "eos_token": "</s>",
+   "unk_token": "<unk>",
+   "pad_token": "<pad>"
+ }
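
Usage note: tokenizer.model is stored as a Git LFS pointer; files with this name are commonly SentencePiece models, though the pointer itself does not say. A quick inspection sketch under that assumption:

    import sentencepiece as spm

    sp = spm.SentencePieceProcessor(model_file="tokenizer.model")
    print(sp.vocab_size())  # expected to match "vocab_size": 99384 in config.json
    # config.json assigns pad/bos/eos ids 0/2/3; whether those pieces are the
    # <pad>/<s>/</s> strings declared in tokenizer_config.json is worth verifying:
    print(sp.id_to_piece(0), sp.id_to_piece(2), sp.id_to_piece(3))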