N8Programs committed on
Commit
b8bde03
·
verified ·
1 Parent(s): 238b1dd

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: en
3
+ tags:
4
+ - mlx
5
+ library_name: mlx
6
+ pipeline_tag: text-generation
7
+ ---
8
+
9
+ ## Chat template
10
+
11
+ This model uses a standard `user` / `assistant` chat API surface, but renders
12
+ messages into TALKIE's play transcript format:
13
+
14
+ ```python
15
+ messages = [
16
+ {"role": "user", "content": "How are you?"},
17
+ ]
18
+ prompt = tokenizer.apply_chat_template(
19
+ messages,
20
+ tokenize=False,
21
+ add_generation_prompt=True,
22
+ )
23
+ ```
24
+
25
+ This produces:
26
+
27
+ ```text
28
+ The following conversation took place between the HUMAN, and TALKIE - a mechanical mind imbued with the knowledge of the world and the ability to use human language - a "thinking machine". It is published here for the benefit of the public:
29
+
30
+ HUMAN:
31
+
32
+ How are you?
33
+
34
+ TALKIE:
35
+
36
+ ```
37
+
38
+ ## Generation stops
39
+
40
+ Stop generation as soon as the output begins a new speaker turn. For direct MLX
41
+ generation, this model treats both `<|endoftext|>` (`65535`) and the `HUM`
42
+ token (`56180`) as EOS. That makes bare `mlx_lm.generate` and
43
+ `mlx_lm.batch_generate` stop when the model begins to write the next `HUMAN:`
44
+ turn.
45
+
46
+ The model package also includes `generation_config.json` with these stop
47
+ strings for runtimes that support text stops:
48
+
49
+ ```json
50
+ [
51
+ "\n\nHUMAN:",
52
+ "\nHUMAN:",
53
+ "HUMAN:",
54
+ "\n\nTALKIE:",
55
+ "\nTALKIE:",
56
+ "TALKIE:"
57
+ ]
58
+ ```
59
+
60
+ When serving through `mlx_lm.server`, pass the same strings as the request
61
+ `stop` field.
__pycache__/talkie_mlx.cpython-311.pyc ADDED
Binary file (14.7 kB). View file
 
config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "TalkieForCausalLM"
4
+ ],
5
+ "eos_token_id": [
6
+ 65535,
7
+ 56180
8
+ ],
9
+ "head_dim": 128,
10
+ "hidden_size": 5120,
11
+ "intermediate_size": 13696,
12
+ "max_position_embeddings": 2048,
13
+ "model_file": "talkie_mlx.py",
14
+ "num_attention_heads": 40,
15
+ "num_hidden_layers": 40,
16
+ "pad_token_id": 65535,
17
+ "rms_norm_eps": 1.1920928955078125e-07,
18
+ "rope_theta": 1000000.0,
19
+ "tie_word_embeddings": false,
20
+ "torch_dtype": "bfloat16",
21
+ "vocab_size": 65536
22
+ }
generation_config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eos_token_id": [
3
+ 65535,
4
+ 56180
5
+ ],
6
+ "pad_token_id": 65535,
7
+ "stop_strings": [
8
+ "\n\nHUMAN:",
9
+ "\nHUMAN:",
10
+ "HUMAN:",
11
+ "\n\nTALKIE:",
12
+ "\nTALKIE:",
13
+ "TALKIE:"
14
+ ]
15
+ }
model-00001-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82943a90614904e401d2f26e9374cda9ba931a15918b4737ffc3418c23e25bbb
3
+ size 5294007495
model-00002-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45d8a18b84f2a439e5c87f02719939758c0a0d7acd116fabe1daac435900cdae
3
+ size 5324154773
model-00003-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e52c1df6bcf1604ea6bec5888529a93ad8daaf215ab03e444d37747887d68df7
3
+ size 5341194581
model-00004-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a592baa0917f0c9540b07dd3eba97fe3efec06727dcd93ca47f49d03e4886e5b
3
+ size 5236336841
model-00005-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d70ec8a5fe5246bf2256b7f0366e77298367744207762335dd48f78ab528cfa
3
+ size 5364786256
model.safetensors.index.json ADDED
@@ -0,0 +1,451 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 26560433522,
4
+ "total_parameters": 12944672441
5
+ },
6
+ "weight_map": {
7
+ "blocks.0.attn.attn_key.weight": "model-00001-of-00005.safetensors",
8
+ "blocks.0.attn.attn_query.weight": "model-00001-of-00005.safetensors",
9
+ "blocks.0.attn.attn_resid.weight": "model-00001-of-00005.safetensors",
10
+ "blocks.0.attn.attn_value.weight": "model-00001-of-00005.safetensors",
11
+ "blocks.0.attn.head_gain.head_g": "model-00001-of-00005.safetensors",
12
+ "blocks.0.attn_gain.a_g": "model-00001-of-00005.safetensors",
13
+ "blocks.0.embed_skip.a_g": "model-00001-of-00005.safetensors",
14
+ "blocks.0.mlp.mlp_gate.weight": "model-00001-of-00005.safetensors",
15
+ "blocks.0.mlp.mlp_linear.weight": "model-00001-of-00005.safetensors",
16
+ "blocks.0.mlp.mlp_resid.weight": "model-00001-of-00005.safetensors",
17
+ "blocks.0.mlp_gain.a_g": "model-00001-of-00005.safetensors",
18
+ "blocks.1.attn.attn_key.weight": "model-00001-of-00005.safetensors",
19
+ "blocks.1.attn.attn_query.weight": "model-00001-of-00005.safetensors",
20
+ "blocks.1.attn.attn_resid.weight": "model-00001-of-00005.safetensors",
21
+ "blocks.1.attn.attn_value.weight": "model-00001-of-00005.safetensors",
22
+ "blocks.1.attn.head_gain.head_g": "model-00001-of-00005.safetensors",
23
+ "blocks.1.attn_gain.a_g": "model-00001-of-00005.safetensors",
24
+ "blocks.1.embed_skip.a_g": "model-00001-of-00005.safetensors",
25
+ "blocks.1.mlp.mlp_gate.weight": "model-00001-of-00005.safetensors",
26
+ "blocks.1.mlp.mlp_linear.weight": "model-00001-of-00005.safetensors",
27
+ "blocks.1.mlp.mlp_resid.weight": "model-00001-of-00005.safetensors",
28
+ "blocks.1.mlp_gain.a_g": "model-00001-of-00005.safetensors",
29
+ "blocks.10.attn.attn_key.weight": "model-00002-of-00005.safetensors",
30
+ "blocks.10.attn.attn_query.weight": "model-00002-of-00005.safetensors",
31
+ "blocks.10.attn.attn_resid.weight": "model-00002-of-00005.safetensors",
32
+ "blocks.10.attn.attn_value.weight": "model-00002-of-00005.safetensors",
33
+ "blocks.10.attn.head_gain.head_g": "model-00002-of-00005.safetensors",
34
+ "blocks.10.attn_gain.a_g": "model-00002-of-00005.safetensors",
35
+ "blocks.10.embed_skip.a_g": "model-00002-of-00005.safetensors",
36
+ "blocks.10.mlp.mlp_gate.weight": "model-00002-of-00005.safetensors",
37
+ "blocks.10.mlp.mlp_linear.weight": "model-00002-of-00005.safetensors",
38
+ "blocks.10.mlp.mlp_resid.weight": "model-00002-of-00005.safetensors",
39
+ "blocks.10.mlp_gain.a_g": "model-00002-of-00005.safetensors",
40
+ "blocks.11.attn.attn_key.weight": "model-00002-of-00005.safetensors",
41
+ "blocks.11.attn.attn_query.weight": "model-00002-of-00005.safetensors",
42
+ "blocks.11.attn.attn_resid.weight": "model-00002-of-00005.safetensors",
43
+ "blocks.11.attn.attn_value.weight": "model-00002-of-00005.safetensors",
44
+ "blocks.11.attn.head_gain.head_g": "model-00002-of-00005.safetensors",
45
+ "blocks.11.attn_gain.a_g": "model-00002-of-00005.safetensors",
46
+ "blocks.11.embed_skip.a_g": "model-00002-of-00005.safetensors",
47
+ "blocks.11.mlp.mlp_gate.weight": "model-00002-of-00005.safetensors",
48
+ "blocks.11.mlp.mlp_linear.weight": "model-00002-of-00005.safetensors",
49
+ "blocks.11.mlp.mlp_resid.weight": "model-00002-of-00005.safetensors",
50
+ "blocks.11.mlp_gain.a_g": "model-00002-of-00005.safetensors",
51
+ "blocks.12.attn.attn_key.weight": "model-00002-of-00005.safetensors",
52
+ "blocks.12.attn.attn_query.weight": "model-00002-of-00005.safetensors",
53
+ "blocks.12.attn.attn_resid.weight": "model-00002-of-00005.safetensors",
54
+ "blocks.12.attn.attn_value.weight": "model-00002-of-00005.safetensors",
55
+ "blocks.12.attn.head_gain.head_g": "model-00002-of-00005.safetensors",
56
+ "blocks.12.attn_gain.a_g": "model-00002-of-00005.safetensors",
57
+ "blocks.12.embed_skip.a_g": "model-00002-of-00005.safetensors",
58
+ "blocks.12.mlp.mlp_gate.weight": "model-00002-of-00005.safetensors",
59
+ "blocks.12.mlp.mlp_linear.weight": "model-00002-of-00005.safetensors",
60
+ "blocks.12.mlp.mlp_resid.weight": "model-00002-of-00005.safetensors",
61
+ "blocks.12.mlp_gain.a_g": "model-00002-of-00005.safetensors",
62
+ "blocks.13.attn.attn_key.weight": "model-00002-of-00005.safetensors",
63
+ "blocks.13.attn.attn_query.weight": "model-00002-of-00005.safetensors",
64
+ "blocks.13.attn.attn_resid.weight": "model-00002-of-00005.safetensors",
65
+ "blocks.13.attn.attn_value.weight": "model-00002-of-00005.safetensors",
66
+ "blocks.13.attn.head_gain.head_g": "model-00002-of-00005.safetensors",
67
+ "blocks.13.attn_gain.a_g": "model-00002-of-00005.safetensors",
68
+ "blocks.13.embed_skip.a_g": "model-00002-of-00005.safetensors",
69
+ "blocks.13.mlp.mlp_gate.weight": "model-00002-of-00005.safetensors",
70
+ "blocks.13.mlp.mlp_linear.weight": "model-00002-of-00005.safetensors",
71
+ "blocks.13.mlp.mlp_resid.weight": "model-00002-of-00005.safetensors",
72
+ "blocks.13.mlp_gain.a_g": "model-00002-of-00005.safetensors",
73
+ "blocks.14.attn.attn_key.weight": "model-00002-of-00005.safetensors",
74
+ "blocks.14.attn.attn_query.weight": "model-00002-of-00005.safetensors",
75
+ "blocks.14.attn.attn_resid.weight": "model-00002-of-00005.safetensors",
76
+ "blocks.14.attn.attn_value.weight": "model-00002-of-00005.safetensors",
77
+ "blocks.14.attn.head_gain.head_g": "model-00002-of-00005.safetensors",
78
+ "blocks.14.attn_gain.a_g": "model-00002-of-00005.safetensors",
79
+ "blocks.14.embed_skip.a_g": "model-00002-of-00005.safetensors",
80
+ "blocks.14.mlp.mlp_gate.weight": "model-00002-of-00005.safetensors",
81
+ "blocks.14.mlp.mlp_linear.weight": "model-00002-of-00005.safetensors",
82
+ "blocks.14.mlp.mlp_resid.weight": "model-00002-of-00005.safetensors",
83
+ "blocks.14.mlp_gain.a_g": "model-00002-of-00005.safetensors",
84
+ "blocks.15.attn.attn_key.weight": "model-00002-of-00005.safetensors",
85
+ "blocks.15.attn.attn_query.weight": "model-00002-of-00005.safetensors",
86
+ "blocks.15.attn.attn_resid.weight": "model-00002-of-00005.safetensors",
87
+ "blocks.15.attn.attn_value.weight": "model-00002-of-00005.safetensors",
88
+ "blocks.15.attn.head_gain.head_g": "model-00002-of-00005.safetensors",
89
+ "blocks.15.attn_gain.a_g": "model-00002-of-00005.safetensors",
90
+ "blocks.15.embed_skip.a_g": "model-00003-of-00005.safetensors",
91
+ "blocks.15.mlp.mlp_gate.weight": "model-00002-of-00005.safetensors",
92
+ "blocks.15.mlp.mlp_linear.weight": "model-00002-of-00005.safetensors",
93
+ "blocks.15.mlp.mlp_resid.weight": "model-00003-of-00005.safetensors",
94
+ "blocks.15.mlp_gain.a_g": "model-00003-of-00005.safetensors",
95
+ "blocks.16.attn.attn_key.weight": "model-00003-of-00005.safetensors",
96
+ "blocks.16.attn.attn_query.weight": "model-00003-of-00005.safetensors",
97
+ "blocks.16.attn.attn_resid.weight": "model-00003-of-00005.safetensors",
98
+ "blocks.16.attn.attn_value.weight": "model-00003-of-00005.safetensors",
99
+ "blocks.16.attn.head_gain.head_g": "model-00003-of-00005.safetensors",
100
+ "blocks.16.attn_gain.a_g": "model-00003-of-00005.safetensors",
101
+ "blocks.16.embed_skip.a_g": "model-00003-of-00005.safetensors",
102
+ "blocks.16.mlp.mlp_gate.weight": "model-00003-of-00005.safetensors",
103
+ "blocks.16.mlp.mlp_linear.weight": "model-00003-of-00005.safetensors",
104
+ "blocks.16.mlp.mlp_resid.weight": "model-00003-of-00005.safetensors",
105
+ "blocks.16.mlp_gain.a_g": "model-00003-of-00005.safetensors",
106
+ "blocks.17.attn.attn_key.weight": "model-00003-of-00005.safetensors",
107
+ "blocks.17.attn.attn_query.weight": "model-00003-of-00005.safetensors",
108
+ "blocks.17.attn.attn_resid.weight": "model-00003-of-00005.safetensors",
109
+ "blocks.17.attn.attn_value.weight": "model-00003-of-00005.safetensors",
110
+ "blocks.17.attn.head_gain.head_g": "model-00003-of-00005.safetensors",
111
+ "blocks.17.attn_gain.a_g": "model-00003-of-00005.safetensors",
112
+ "blocks.17.embed_skip.a_g": "model-00003-of-00005.safetensors",
113
+ "blocks.17.mlp.mlp_gate.weight": "model-00003-of-00005.safetensors",
114
+ "blocks.17.mlp.mlp_linear.weight": "model-00003-of-00005.safetensors",
115
+ "blocks.17.mlp.mlp_resid.weight": "model-00003-of-00005.safetensors",
116
+ "blocks.17.mlp_gain.a_g": "model-00003-of-00005.safetensors",
117
+ "blocks.18.attn.attn_key.weight": "model-00003-of-00005.safetensors",
118
+ "blocks.18.attn.attn_query.weight": "model-00003-of-00005.safetensors",
119
+ "blocks.18.attn.attn_resid.weight": "model-00003-of-00005.safetensors",
120
+ "blocks.18.attn.attn_value.weight": "model-00003-of-00005.safetensors",
121
+ "blocks.18.attn.head_gain.head_g": "model-00003-of-00005.safetensors",
122
+ "blocks.18.attn_gain.a_g": "model-00003-of-00005.safetensors",
123
+ "blocks.18.embed_skip.a_g": "model-00003-of-00005.safetensors",
124
+ "blocks.18.mlp.mlp_gate.weight": "model-00003-of-00005.safetensors",
125
+ "blocks.18.mlp.mlp_linear.weight": "model-00003-of-00005.safetensors",
126
+ "blocks.18.mlp.mlp_resid.weight": "model-00003-of-00005.safetensors",
127
+ "blocks.18.mlp_gain.a_g": "model-00003-of-00005.safetensors",
128
+ "blocks.19.attn.attn_key.weight": "model-00003-of-00005.safetensors",
129
+ "blocks.19.attn.attn_query.weight": "model-00003-of-00005.safetensors",
130
+ "blocks.19.attn.attn_resid.weight": "model-00003-of-00005.safetensors",
131
+ "blocks.19.attn.attn_value.weight": "model-00003-of-00005.safetensors",
132
+ "blocks.19.attn.head_gain.head_g": "model-00003-of-00005.safetensors",
133
+ "blocks.19.attn_gain.a_g": "model-00003-of-00005.safetensors",
134
+ "blocks.19.embed_skip.a_g": "model-00003-of-00005.safetensors",
135
+ "blocks.19.mlp.mlp_gate.weight": "model-00003-of-00005.safetensors",
136
+ "blocks.19.mlp.mlp_linear.weight": "model-00003-of-00005.safetensors",
137
+ "blocks.19.mlp.mlp_resid.weight": "model-00003-of-00005.safetensors",
138
+ "blocks.19.mlp_gain.a_g": "model-00003-of-00005.safetensors",
139
+ "blocks.2.attn.attn_key.weight": "model-00001-of-00005.safetensors",
140
+ "blocks.2.attn.attn_query.weight": "model-00001-of-00005.safetensors",
141
+ "blocks.2.attn.attn_resid.weight": "model-00001-of-00005.safetensors",
142
+ "blocks.2.attn.attn_value.weight": "model-00001-of-00005.safetensors",
143
+ "blocks.2.attn.head_gain.head_g": "model-00001-of-00005.safetensors",
144
+ "blocks.2.attn_gain.a_g": "model-00001-of-00005.safetensors",
145
+ "blocks.2.embed_skip.a_g": "model-00001-of-00005.safetensors",
146
+ "blocks.2.mlp.mlp_gate.weight": "model-00001-of-00005.safetensors",
147
+ "blocks.2.mlp.mlp_linear.weight": "model-00001-of-00005.safetensors",
148
+ "blocks.2.mlp.mlp_resid.weight": "model-00001-of-00005.safetensors",
149
+ "blocks.2.mlp_gain.a_g": "model-00001-of-00005.safetensors",
150
+ "blocks.20.attn.attn_key.weight": "model-00003-of-00005.safetensors",
151
+ "blocks.20.attn.attn_query.weight": "model-00003-of-00005.safetensors",
152
+ "blocks.20.attn.attn_resid.weight": "model-00003-of-00005.safetensors",
153
+ "blocks.20.attn.attn_value.weight": "model-00003-of-00005.safetensors",
154
+ "blocks.20.attn.head_gain.head_g": "model-00003-of-00005.safetensors",
155
+ "blocks.20.attn_gain.a_g": "model-00003-of-00005.safetensors",
156
+ "blocks.20.embed_skip.a_g": "model-00003-of-00005.safetensors",
157
+ "blocks.20.mlp.mlp_gate.weight": "model-00003-of-00005.safetensors",
158
+ "blocks.20.mlp.mlp_linear.weight": "model-00003-of-00005.safetensors",
159
+ "blocks.20.mlp.mlp_resid.weight": "model-00003-of-00005.safetensors",
160
+ "blocks.20.mlp_gain.a_g": "model-00003-of-00005.safetensors",
161
+ "blocks.21.attn.attn_key.weight": "model-00003-of-00005.safetensors",
162
+ "blocks.21.attn.attn_query.weight": "model-00003-of-00005.safetensors",
163
+ "blocks.21.attn.attn_resid.weight": "model-00003-of-00005.safetensors",
164
+ "blocks.21.attn.attn_value.weight": "model-00003-of-00005.safetensors",
165
+ "blocks.21.attn.head_gain.head_g": "model-00003-of-00005.safetensors",
166
+ "blocks.21.attn_gain.a_g": "model-00003-of-00005.safetensors",
167
+ "blocks.21.embed_skip.a_g": "model-00003-of-00005.safetensors",
168
+ "blocks.21.mlp.mlp_gate.weight": "model-00003-of-00005.safetensors",
169
+ "blocks.21.mlp.mlp_linear.weight": "model-00003-of-00005.safetensors",
170
+ "blocks.21.mlp.mlp_resid.weight": "model-00003-of-00005.safetensors",
171
+ "blocks.21.mlp_gain.a_g": "model-00003-of-00005.safetensors",
172
+ "blocks.22.attn.attn_key.weight": "model-00003-of-00005.safetensors",
173
+ "blocks.22.attn.attn_query.weight": "model-00003-of-00005.safetensors",
174
+ "blocks.22.attn.attn_resid.weight": "model-00003-of-00005.safetensors",
175
+ "blocks.22.attn.attn_value.weight": "model-00003-of-00005.safetensors",
176
+ "blocks.22.attn.head_gain.head_g": "model-00003-of-00005.safetensors",
177
+ "blocks.22.attn_gain.a_g": "model-00003-of-00005.safetensors",
178
+ "blocks.22.embed_skip.a_g": "model-00003-of-00005.safetensors",
179
+ "blocks.22.mlp.mlp_gate.weight": "model-00003-of-00005.safetensors",
180
+ "blocks.22.mlp.mlp_linear.weight": "model-00003-of-00005.safetensors",
181
+ "blocks.22.mlp.mlp_resid.weight": "model-00003-of-00005.safetensors",
182
+ "blocks.22.mlp_gain.a_g": "model-00003-of-00005.safetensors",
183
+ "blocks.23.attn.attn_key.weight": "model-00003-of-00005.safetensors",
184
+ "blocks.23.attn.attn_query.weight": "model-00003-of-00005.safetensors",
185
+ "blocks.23.attn.attn_resid.weight": "model-00003-of-00005.safetensors",
186
+ "blocks.23.attn.attn_value.weight": "model-00003-of-00005.safetensors",
187
+ "blocks.23.attn.head_gain.head_g": "model-00003-of-00005.safetensors",
188
+ "blocks.23.attn_gain.a_g": "model-00003-of-00005.safetensors",
189
+ "blocks.23.embed_skip.a_g": "model-00003-of-00005.safetensors",
190
+ "blocks.23.mlp.mlp_gate.weight": "model-00003-of-00005.safetensors",
191
+ "blocks.23.mlp.mlp_linear.weight": "model-00003-of-00005.safetensors",
192
+ "blocks.23.mlp.mlp_resid.weight": "model-00003-of-00005.safetensors",
193
+ "blocks.23.mlp_gain.a_g": "model-00003-of-00005.safetensors",
194
+ "blocks.24.attn.attn_key.weight": "model-00003-of-00005.safetensors",
195
+ "blocks.24.attn.attn_query.weight": "model-00003-of-00005.safetensors",
196
+ "blocks.24.attn.attn_resid.weight": "model-00004-of-00005.safetensors",
197
+ "blocks.24.attn.attn_value.weight": "model-00003-of-00005.safetensors",
198
+ "blocks.24.attn.head_gain.head_g": "model-00004-of-00005.safetensors",
199
+ "blocks.24.attn_gain.a_g": "model-00004-of-00005.safetensors",
200
+ "blocks.24.embed_skip.a_g": "model-00004-of-00005.safetensors",
201
+ "blocks.24.mlp.mlp_gate.weight": "model-00004-of-00005.safetensors",
202
+ "blocks.24.mlp.mlp_linear.weight": "model-00004-of-00005.safetensors",
203
+ "blocks.24.mlp.mlp_resid.weight": "model-00004-of-00005.safetensors",
204
+ "blocks.24.mlp_gain.a_g": "model-00004-of-00005.safetensors",
205
+ "blocks.25.attn.attn_key.weight": "model-00004-of-00005.safetensors",
206
+ "blocks.25.attn.attn_query.weight": "model-00004-of-00005.safetensors",
207
+ "blocks.25.attn.attn_resid.weight": "model-00004-of-00005.safetensors",
208
+ "blocks.25.attn.attn_value.weight": "model-00004-of-00005.safetensors",
209
+ "blocks.25.attn.head_gain.head_g": "model-00004-of-00005.safetensors",
210
+ "blocks.25.attn_gain.a_g": "model-00004-of-00005.safetensors",
211
+ "blocks.25.embed_skip.a_g": "model-00004-of-00005.safetensors",
212
+ "blocks.25.mlp.mlp_gate.weight": "model-00004-of-00005.safetensors",
213
+ "blocks.25.mlp.mlp_linear.weight": "model-00004-of-00005.safetensors",
214
+ "blocks.25.mlp.mlp_resid.weight": "model-00004-of-00005.safetensors",
215
+ "blocks.25.mlp_gain.a_g": "model-00004-of-00005.safetensors",
216
+ "blocks.26.attn.attn_key.weight": "model-00004-of-00005.safetensors",
217
+ "blocks.26.attn.attn_query.weight": "model-00004-of-00005.safetensors",
218
+ "blocks.26.attn.attn_resid.weight": "model-00004-of-00005.safetensors",
219
+ "blocks.26.attn.attn_value.weight": "model-00004-of-00005.safetensors",
220
+ "blocks.26.attn.head_gain.head_g": "model-00004-of-00005.safetensors",
221
+ "blocks.26.attn_gain.a_g": "model-00004-of-00005.safetensors",
222
+ "blocks.26.embed_skip.a_g": "model-00004-of-00005.safetensors",
223
+ "blocks.26.mlp.mlp_gate.weight": "model-00004-of-00005.safetensors",
224
+ "blocks.26.mlp.mlp_linear.weight": "model-00004-of-00005.safetensors",
225
+ "blocks.26.mlp.mlp_resid.weight": "model-00004-of-00005.safetensors",
226
+ "blocks.26.mlp_gain.a_g": "model-00004-of-00005.safetensors",
227
+ "blocks.27.attn.attn_key.weight": "model-00004-of-00005.safetensors",
228
+ "blocks.27.attn.attn_query.weight": "model-00004-of-00005.safetensors",
229
+ "blocks.27.attn.attn_resid.weight": "model-00004-of-00005.safetensors",
230
+ "blocks.27.attn.attn_value.weight": "model-00004-of-00005.safetensors",
231
+ "blocks.27.attn.head_gain.head_g": "model-00004-of-00005.safetensors",
232
+ "blocks.27.attn_gain.a_g": "model-00004-of-00005.safetensors",
233
+ "blocks.27.embed_skip.a_g": "model-00004-of-00005.safetensors",
234
+ "blocks.27.mlp.mlp_gate.weight": "model-00004-of-00005.safetensors",
235
+ "blocks.27.mlp.mlp_linear.weight": "model-00004-of-00005.safetensors",
236
+ "blocks.27.mlp.mlp_resid.weight": "model-00004-of-00005.safetensors",
237
+ "blocks.27.mlp_gain.a_g": "model-00004-of-00005.safetensors",
238
+ "blocks.28.attn.attn_key.weight": "model-00004-of-00005.safetensors",
239
+ "blocks.28.attn.attn_query.weight": "model-00004-of-00005.safetensors",
240
+ "blocks.28.attn.attn_resid.weight": "model-00004-of-00005.safetensors",
241
+ "blocks.28.attn.attn_value.weight": "model-00004-of-00005.safetensors",
242
+ "blocks.28.attn.head_gain.head_g": "model-00004-of-00005.safetensors",
243
+ "blocks.28.attn_gain.a_g": "model-00004-of-00005.safetensors",
244
+ "blocks.28.embed_skip.a_g": "model-00004-of-00005.safetensors",
245
+ "blocks.28.mlp.mlp_gate.weight": "model-00004-of-00005.safetensors",
246
+ "blocks.28.mlp.mlp_linear.weight": "model-00004-of-00005.safetensors",
247
+ "blocks.28.mlp.mlp_resid.weight": "model-00004-of-00005.safetensors",
248
+ "blocks.28.mlp_gain.a_g": "model-00004-of-00005.safetensors",
249
+ "blocks.29.attn.attn_key.weight": "model-00004-of-00005.safetensors",
250
+ "blocks.29.attn.attn_query.weight": "model-00004-of-00005.safetensors",
251
+ "blocks.29.attn.attn_resid.weight": "model-00004-of-00005.safetensors",
252
+ "blocks.29.attn.attn_value.weight": "model-00004-of-00005.safetensors",
253
+ "blocks.29.attn.head_gain.head_g": "model-00004-of-00005.safetensors",
254
+ "blocks.29.attn_gain.a_g": "model-00004-of-00005.safetensors",
255
+ "blocks.29.embed_skip.a_g": "model-00004-of-00005.safetensors",
256
+ "blocks.29.mlp.mlp_gate.weight": "model-00004-of-00005.safetensors",
257
+ "blocks.29.mlp.mlp_linear.weight": "model-00004-of-00005.safetensors",
258
+ "blocks.29.mlp.mlp_resid.weight": "model-00004-of-00005.safetensors",
259
+ "blocks.29.mlp_gain.a_g": "model-00004-of-00005.safetensors",
260
+ "blocks.3.attn.attn_key.weight": "model-00001-of-00005.safetensors",
261
+ "blocks.3.attn.attn_query.weight": "model-00001-of-00005.safetensors",
262
+ "blocks.3.attn.attn_resid.weight": "model-00001-of-00005.safetensors",
263
+ "blocks.3.attn.attn_value.weight": "model-00001-of-00005.safetensors",
264
+ "blocks.3.attn.head_gain.head_g": "model-00001-of-00005.safetensors",
265
+ "blocks.3.attn_gain.a_g": "model-00001-of-00005.safetensors",
266
+ "blocks.3.embed_skip.a_g": "model-00001-of-00005.safetensors",
267
+ "blocks.3.mlp.mlp_gate.weight": "model-00001-of-00005.safetensors",
268
+ "blocks.3.mlp.mlp_linear.weight": "model-00001-of-00005.safetensors",
269
+ "blocks.3.mlp.mlp_resid.weight": "model-00001-of-00005.safetensors",
270
+ "blocks.3.mlp_gain.a_g": "model-00001-of-00005.safetensors",
271
+ "blocks.30.attn.attn_key.weight": "model-00004-of-00005.safetensors",
272
+ "blocks.30.attn.attn_query.weight": "model-00004-of-00005.safetensors",
273
+ "blocks.30.attn.attn_resid.weight": "model-00004-of-00005.safetensors",
274
+ "blocks.30.attn.attn_value.weight": "model-00004-of-00005.safetensors",
275
+ "blocks.30.attn.head_gain.head_g": "model-00004-of-00005.safetensors",
276
+ "blocks.30.attn_gain.a_g": "model-00004-of-00005.safetensors",
277
+ "blocks.30.embed_skip.a_g": "model-00004-of-00005.safetensors",
278
+ "blocks.30.mlp.mlp_gate.weight": "model-00004-of-00005.safetensors",
279
+ "blocks.30.mlp.mlp_linear.weight": "model-00004-of-00005.safetensors",
280
+ "blocks.30.mlp.mlp_resid.weight": "model-00004-of-00005.safetensors",
281
+ "blocks.30.mlp_gain.a_g": "model-00004-of-00005.safetensors",
282
+ "blocks.31.attn.attn_key.weight": "model-00004-of-00005.safetensors",
283
+ "blocks.31.attn.attn_query.weight": "model-00004-of-00005.safetensors",
284
+ "blocks.31.attn.attn_resid.weight": "model-00004-of-00005.safetensors",
285
+ "blocks.31.attn.attn_value.weight": "model-00004-of-00005.safetensors",
286
+ "blocks.31.attn.head_gain.head_g": "model-00004-of-00005.safetensors",
287
+ "blocks.31.attn_gain.a_g": "model-00004-of-00005.safetensors",
288
+ "blocks.31.embed_skip.a_g": "model-00004-of-00005.safetensors",
289
+ "blocks.31.mlp.mlp_gate.weight": "model-00004-of-00005.safetensors",
290
+ "blocks.31.mlp.mlp_linear.weight": "model-00004-of-00005.safetensors",
291
+ "blocks.31.mlp.mlp_resid.weight": "model-00004-of-00005.safetensors",
292
+ "blocks.31.mlp_gain.a_g": "model-00004-of-00005.safetensors",
293
+ "blocks.32.attn.attn_key.weight": "model-00004-of-00005.safetensors",
294
+ "blocks.32.attn.attn_query.weight": "model-00004-of-00005.safetensors",
295
+ "blocks.32.attn.attn_resid.weight": "model-00004-of-00005.safetensors",
296
+ "blocks.32.attn.attn_value.weight": "model-00004-of-00005.safetensors",
297
+ "blocks.32.attn.head_gain.head_g": "model-00004-of-00005.safetensors",
298
+ "blocks.32.attn_gain.a_g": "model-00004-of-00005.safetensors",
299
+ "blocks.32.embed_skip.a_g": "model-00005-of-00005.safetensors",
300
+ "blocks.32.mlp.mlp_gate.weight": "model-00004-of-00005.safetensors",
301
+ "blocks.32.mlp.mlp_linear.weight": "model-00005-of-00005.safetensors",
302
+ "blocks.32.mlp.mlp_resid.weight": "model-00005-of-00005.safetensors",
303
+ "blocks.32.mlp_gain.a_g": "model-00005-of-00005.safetensors",
304
+ "blocks.33.attn.attn_key.weight": "model-00005-of-00005.safetensors",
305
+ "blocks.33.attn.attn_query.weight": "model-00005-of-00005.safetensors",
306
+ "blocks.33.attn.attn_resid.weight": "model-00005-of-00005.safetensors",
307
+ "blocks.33.attn.attn_value.weight": "model-00005-of-00005.safetensors",
308
+ "blocks.33.attn.head_gain.head_g": "model-00005-of-00005.safetensors",
309
+ "blocks.33.attn_gain.a_g": "model-00005-of-00005.safetensors",
310
+ "blocks.33.embed_skip.a_g": "model-00005-of-00005.safetensors",
311
+ "blocks.33.mlp.mlp_gate.weight": "model-00005-of-00005.safetensors",
312
+ "blocks.33.mlp.mlp_linear.weight": "model-00005-of-00005.safetensors",
313
+ "blocks.33.mlp.mlp_resid.weight": "model-00005-of-00005.safetensors",
314
+ "blocks.33.mlp_gain.a_g": "model-00005-of-00005.safetensors",
315
+ "blocks.34.attn.attn_key.weight": "model-00005-of-00005.safetensors",
316
+ "blocks.34.attn.attn_query.weight": "model-00005-of-00005.safetensors",
317
+ "blocks.34.attn.attn_resid.weight": "model-00005-of-00005.safetensors",
318
+ "blocks.34.attn.attn_value.weight": "model-00005-of-00005.safetensors",
319
+ "blocks.34.attn.head_gain.head_g": "model-00005-of-00005.safetensors",
320
+ "blocks.34.attn_gain.a_g": "model-00005-of-00005.safetensors",
321
+ "blocks.34.embed_skip.a_g": "model-00005-of-00005.safetensors",
322
+ "blocks.34.mlp.mlp_gate.weight": "model-00005-of-00005.safetensors",
323
+ "blocks.34.mlp.mlp_linear.weight": "model-00005-of-00005.safetensors",
324
+ "blocks.34.mlp.mlp_resid.weight": "model-00005-of-00005.safetensors",
325
+ "blocks.34.mlp_gain.a_g": "model-00005-of-00005.safetensors",
326
+ "blocks.35.attn.attn_key.weight": "model-00005-of-00005.safetensors",
327
+ "blocks.35.attn.attn_query.weight": "model-00005-of-00005.safetensors",
328
+ "blocks.35.attn.attn_resid.weight": "model-00005-of-00005.safetensors",
329
+ "blocks.35.attn.attn_value.weight": "model-00005-of-00005.safetensors",
330
+ "blocks.35.attn.head_gain.head_g": "model-00005-of-00005.safetensors",
331
+ "blocks.35.attn_gain.a_g": "model-00005-of-00005.safetensors",
332
+ "blocks.35.embed_skip.a_g": "model-00005-of-00005.safetensors",
333
+ "blocks.35.mlp.mlp_gate.weight": "model-00005-of-00005.safetensors",
334
+ "blocks.35.mlp.mlp_linear.weight": "model-00005-of-00005.safetensors",
335
+ "blocks.35.mlp.mlp_resid.weight": "model-00005-of-00005.safetensors",
336
+ "blocks.35.mlp_gain.a_g": "model-00005-of-00005.safetensors",
337
+ "blocks.36.attn.attn_key.weight": "model-00005-of-00005.safetensors",
338
+ "blocks.36.attn.attn_query.weight": "model-00005-of-00005.safetensors",
339
+ "blocks.36.attn.attn_resid.weight": "model-00005-of-00005.safetensors",
340
+ "blocks.36.attn.attn_value.weight": "model-00005-of-00005.safetensors",
341
+ "blocks.36.attn.head_gain.head_g": "model-00005-of-00005.safetensors",
342
+ "blocks.36.attn_gain.a_g": "model-00005-of-00005.safetensors",
343
+ "blocks.36.embed_skip.a_g": "model-00005-of-00005.safetensors",
344
+ "blocks.36.mlp.mlp_gate.weight": "model-00005-of-00005.safetensors",
345
+ "blocks.36.mlp.mlp_linear.weight": "model-00005-of-00005.safetensors",
346
+ "blocks.36.mlp.mlp_resid.weight": "model-00005-of-00005.safetensors",
347
+ "blocks.36.mlp_gain.a_g": "model-00005-of-00005.safetensors",
348
+ "blocks.37.attn.attn_key.weight": "model-00005-of-00005.safetensors",
349
+ "blocks.37.attn.attn_query.weight": "model-00005-of-00005.safetensors",
350
+ "blocks.37.attn.attn_resid.weight": "model-00005-of-00005.safetensors",
351
+ "blocks.37.attn.attn_value.weight": "model-00005-of-00005.safetensors",
352
+ "blocks.37.attn.head_gain.head_g": "model-00005-of-00005.safetensors",
353
+ "blocks.37.attn_gain.a_g": "model-00005-of-00005.safetensors",
354
+ "blocks.37.embed_skip.a_g": "model-00005-of-00005.safetensors",
355
+ "blocks.37.mlp.mlp_gate.weight": "model-00005-of-00005.safetensors",
356
+ "blocks.37.mlp.mlp_linear.weight": "model-00005-of-00005.safetensors",
357
+ "blocks.37.mlp.mlp_resid.weight": "model-00005-of-00005.safetensors",
358
+ "blocks.37.mlp_gain.a_g": "model-00005-of-00005.safetensors",
359
+ "blocks.38.attn.attn_key.weight": "model-00005-of-00005.safetensors",
360
+ "blocks.38.attn.attn_query.weight": "model-00005-of-00005.safetensors",
361
+ "blocks.38.attn.attn_resid.weight": "model-00005-of-00005.safetensors",
362
+ "blocks.38.attn.attn_value.weight": "model-00005-of-00005.safetensors",
363
+ "blocks.38.attn.head_gain.head_g": "model-00005-of-00005.safetensors",
364
+ "blocks.38.attn_gain.a_g": "model-00005-of-00005.safetensors",
365
+ "blocks.38.embed_skip.a_g": "model-00005-of-00005.safetensors",
366
+ "blocks.38.mlp.mlp_gate.weight": "model-00005-of-00005.safetensors",
367
+ "blocks.38.mlp.mlp_linear.weight": "model-00005-of-00005.safetensors",
368
+ "blocks.38.mlp.mlp_resid.weight": "model-00005-of-00005.safetensors",
369
+ "blocks.38.mlp_gain.a_g": "model-00005-of-00005.safetensors",
370
+ "blocks.39.attn.attn_key.weight": "model-00005-of-00005.safetensors",
371
+ "blocks.39.attn.attn_query.weight": "model-00005-of-00005.safetensors",
372
+ "blocks.39.attn.attn_resid.weight": "model-00005-of-00005.safetensors",
373
+ "blocks.39.attn.attn_value.weight": "model-00005-of-00005.safetensors",
374
+ "blocks.39.attn.head_gain.head_g": "model-00005-of-00005.safetensors",
375
+ "blocks.39.attn_gain.a_g": "model-00005-of-00005.safetensors",
376
+ "blocks.39.embed_skip.a_g": "model-00005-of-00005.safetensors",
377
+ "blocks.39.mlp.mlp_gate.weight": "model-00005-of-00005.safetensors",
378
+ "blocks.39.mlp.mlp_linear.weight": "model-00005-of-00005.safetensors",
379
+ "blocks.39.mlp.mlp_resid.weight": "model-00005-of-00005.safetensors",
380
+ "blocks.39.mlp_gain.a_g": "model-00005-of-00005.safetensors",
381
+ "blocks.4.attn.attn_key.weight": "model-00001-of-00005.safetensors",
382
+ "blocks.4.attn.attn_query.weight": "model-00001-of-00005.safetensors",
383
+ "blocks.4.attn.attn_resid.weight": "model-00001-of-00005.safetensors",
384
+ "blocks.4.attn.attn_value.weight": "model-00001-of-00005.safetensors",
385
+ "blocks.4.attn.head_gain.head_g": "model-00001-of-00005.safetensors",
386
+ "blocks.4.attn_gain.a_g": "model-00001-of-00005.safetensors",
387
+ "blocks.4.embed_skip.a_g": "model-00001-of-00005.safetensors",
388
+ "blocks.4.mlp.mlp_gate.weight": "model-00001-of-00005.safetensors",
389
+ "blocks.4.mlp.mlp_linear.weight": "model-00001-of-00005.safetensors",
390
+ "blocks.4.mlp.mlp_resid.weight": "model-00001-of-00005.safetensors",
391
+ "blocks.4.mlp_gain.a_g": "model-00001-of-00005.safetensors",
392
+ "blocks.5.attn.attn_key.weight": "model-00001-of-00005.safetensors",
393
+ "blocks.5.attn.attn_query.weight": "model-00001-of-00005.safetensors",
394
+ "blocks.5.attn.attn_resid.weight": "model-00001-of-00005.safetensors",
395
+ "blocks.5.attn.attn_value.weight": "model-00001-of-00005.safetensors",
396
+ "blocks.5.attn.head_gain.head_g": "model-00001-of-00005.safetensors",
397
+ "blocks.5.attn_gain.a_g": "model-00001-of-00005.safetensors",
398
+ "blocks.5.embed_skip.a_g": "model-00001-of-00005.safetensors",
399
+ "blocks.5.mlp.mlp_gate.weight": "model-00001-of-00005.safetensors",
400
+ "blocks.5.mlp.mlp_linear.weight": "model-00001-of-00005.safetensors",
401
+ "blocks.5.mlp.mlp_resid.weight": "model-00001-of-00005.safetensors",
402
+ "blocks.5.mlp_gain.a_g": "model-00001-of-00005.safetensors",
403
+ "blocks.6.attn.attn_key.weight": "model-00001-of-00005.safetensors",
404
+ "blocks.6.attn.attn_query.weight": "model-00001-of-00005.safetensors",
405
+ "blocks.6.attn.attn_resid.weight": "model-00001-of-00005.safetensors",
406
+ "blocks.6.attn.attn_value.weight": "model-00001-of-00005.safetensors",
407
+ "blocks.6.attn.head_gain.head_g": "model-00001-of-00005.safetensors",
408
+ "blocks.6.attn_gain.a_g": "model-00001-of-00005.safetensors",
409
+ "blocks.6.embed_skip.a_g": "model-00001-of-00005.safetensors",
410
+ "blocks.6.mlp.mlp_gate.weight": "model-00001-of-00005.safetensors",
411
+ "blocks.6.mlp.mlp_linear.weight": "model-00001-of-00005.safetensors",
412
+ "blocks.6.mlp.mlp_resid.weight": "model-00001-of-00005.safetensors",
413
+ "blocks.6.mlp_gain.a_g": "model-00001-of-00005.safetensors",
414
+ "blocks.7.attn.attn_key.weight": "model-00001-of-00005.safetensors",
415
+ "blocks.7.attn.attn_query.weight": "model-00001-of-00005.safetensors",
416
+ "blocks.7.attn.attn_resid.weight": "model-00001-of-00005.safetensors",
417
+ "blocks.7.attn.attn_value.weight": "model-00001-of-00005.safetensors",
418
+ "blocks.7.attn.head_gain.head_g": "model-00001-of-00005.safetensors",
419
+ "blocks.7.attn_gain.a_g": "model-00001-of-00005.safetensors",
420
+ "blocks.7.embed_skip.a_g": "model-00002-of-00005.safetensors",
421
+ "blocks.7.mlp.mlp_gate.weight": "model-00002-of-00005.safetensors",
422
+ "blocks.7.mlp.mlp_linear.weight": "model-00002-of-00005.safetensors",
423
+ "blocks.7.mlp.mlp_resid.weight": "model-00002-of-00005.safetensors",
424
+ "blocks.7.mlp_gain.a_g": "model-00002-of-00005.safetensors",
425
+ "blocks.8.attn.attn_key.weight": "model-00002-of-00005.safetensors",
426
+ "blocks.8.attn.attn_query.weight": "model-00002-of-00005.safetensors",
427
+ "blocks.8.attn.attn_resid.weight": "model-00002-of-00005.safetensors",
428
+ "blocks.8.attn.attn_value.weight": "model-00002-of-00005.safetensors",
429
+ "blocks.8.attn.head_gain.head_g": "model-00002-of-00005.safetensors",
430
+ "blocks.8.attn_gain.a_g": "model-00002-of-00005.safetensors",
431
+ "blocks.8.embed_skip.a_g": "model-00002-of-00005.safetensors",
432
+ "blocks.8.mlp.mlp_gate.weight": "model-00002-of-00005.safetensors",
433
+ "blocks.8.mlp.mlp_linear.weight": "model-00002-of-00005.safetensors",
434
+ "blocks.8.mlp.mlp_resid.weight": "model-00002-of-00005.safetensors",
435
+ "blocks.8.mlp_gain.a_g": "model-00002-of-00005.safetensors",
436
+ "blocks.9.attn.attn_key.weight": "model-00002-of-00005.safetensors",
437
+ "blocks.9.attn.attn_query.weight": "model-00002-of-00005.safetensors",
438
+ "blocks.9.attn.attn_resid.weight": "model-00002-of-00005.safetensors",
439
+ "blocks.9.attn.attn_value.weight": "model-00002-of-00005.safetensors",
440
+ "blocks.9.attn.head_gain.head_g": "model-00002-of-00005.safetensors",
441
+ "blocks.9.attn_gain.a_g": "model-00002-of-00005.safetensors",
442
+ "blocks.9.embed_skip.a_g": "model-00002-of-00005.safetensors",
443
+ "blocks.9.mlp.mlp_gate.weight": "model-00002-of-00005.safetensors",
444
+ "blocks.9.mlp.mlp_linear.weight": "model-00002-of-00005.safetensors",
445
+ "blocks.9.mlp.mlp_resid.weight": "model-00002-of-00005.safetensors",
446
+ "blocks.9.mlp_gain.a_g": "model-00002-of-00005.safetensors",
447
+ "embed.weight": "model-00001-of-00005.safetensors",
448
+ "lm_head": "model-00005-of-00005.safetensors",
449
+ "lm_head_gain.w_g": "model-00005-of-00005.safetensors"
450
+ }
451
+ }
talkie_mlx.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2026
2
+ #
3
+ # MLX implementation of talkie-lm/talkie-1930-13b-base.
4
+ # This file is intentionally self-contained so an MLX model directory can load it
5
+ # through config.json: {"model_file": "talkie_mlx.py"}.
6
+
7
+ import math
8
+ from dataclasses import dataclass
9
+ from typing import Any, Optional
10
+
11
+ import mlx.core as mx
12
+ import mlx.nn as nn
13
+
14
+ from mlx_lm.models.base import BaseModelArgs, create_attention_mask
15
+ from mlx_lm.models.base import scaled_dot_product_attention
16
+
17
+
18
@dataclass
class ModelArgs(BaseModelArgs):
    """Hyper-parameters for the Talkie MLX model.

    Each field carries a default, so values absent from ``config.json`` fall
    back to the numbers below.
    """

    # Architecture identifier; ``Model`` copies it to ``Model.model_type``.
    model_type: str = "talkie"
    # Rows of the embedding table and of the ``lm_head`` matrix.
    vocab_size: int = 65536
    # Width of the residual stream and of every attention projection.
    hidden_size: int = 5120
    # Number of ``Block`` instances stacked in ``Model.blocks``.
    num_hidden_layers: int = 40
    # Heads the attention projections are split into.
    num_attention_heads: int = 40
    # Inner width of the gated MLP.
    intermediate_size: int = 13696
    # Per-head feature size used for the head reshape and the softmax scale.
    head_dim: int = 128
    # NOTE(review): not read anywhere in this file; presumably informational.
    max_position_embeddings: int = 2048
    # Base handed to ``apply_talkie_rope``.
    rope_theta: float = 1e6
    # NOTE(review): not read anywhere in this file; ``lm_head`` is always a
    # separate parameter here.
    tie_word_embeddings: bool = False
    # Epsilon for every RMS norm. The default equals 2**-23 (float32 machine
    # epsilon); ``None`` makes ``rms_norm`` use the input dtype's own epsilon.
    rms_norm_eps: Optional[float] = 2.0**-23
31
+
32
+
33
def rms_norm(x: mx.array, eps: Optional[float] = None) -> mx.array:
    """RMS-normalise ``x`` without a learned scale.

    Args:
        x: Array to normalise.
        eps: Stabilising epsilon. When ``None``, the machine epsilon of
            ``x.dtype`` is used instead.

    Returns:
        The result of ``mx.fast.rms_norm`` called with no weight.
    """
    epsilon = mx.finfo(x.dtype).eps if eps is None else eps
    return mx.fast.rms_norm(x, None, epsilon)
37
+
38
+
39
def apply_talkie_rope(x: mx.array, offset: int, base: float) -> mx.array:
    """Apply Talkie's split-half RoPE to tensors shaped [B, H, T, D].

    Args:
        x: Query or key tensor; the rotation covers the whole last axis.
        offset: Position offset forwarded to ``mx.fast.rope``.
        base: RoPE base (``rope_theta``) from which the frequencies are built.

    Returns:
        ``x`` with rotary position information applied.
    """
    rotary_dims = x.shape[-1]
    num_pairs = rotary_dims // 2

    # freqs[i] = -(base ** (i / num_pairs)), built in float32.
    # NOTE(review): the leading minus sign presumably matches the rotation
    # direction of the reference Talkie implementation - confirm upstream.
    log_step = math.log(base) / num_pairs
    pair_index = mx.arange(0.0, num_pairs, dtype=mx.float32)
    freqs = -mx.exp(pair_index * log_step)

    # ``base`` is None because explicit ``freqs`` are supplied instead.
    return mx.fast.rope(
        x,
        dims=rotary_dims,
        traditional=False,
        base=None,
        freqs=freqs,
        scale=1.0,
        offset=offset,
    )
55
+
56
+
57
class HeadGain(nn.Module):
    """Learned per-head multiplier for [B, H, T, D] tensors."""

    def __init__(self, num_heads: int):
        super().__init__()
        # One float32 gain per attention head, initialised to 1.
        self.head_g = mx.ones((num_heads,), dtype=mx.float32)

    def __call__(self, x: mx.array) -> mx.array:
        """Scale each head of ``x`` by its gain, computed in ``x``'s dtype."""
        gains = self.head_g.astype(x.dtype)
        # Shape (1, H, 1, 1) so the gain broadcasts along axis 1.
        return x * gains.reshape(1, -1, 1, 1)
64
+
65
+
66
class WeightGain(nn.Module):
    """Single learned scalar that multiplies a weight matrix."""

    def __init__(self):
        super().__init__()
        # Stored as a one-element float32 array, initialised to 1.
        self.w_g = mx.ones((1,), dtype=mx.float32)

    def __call__(self, w: mx.array) -> mx.array:
        """Return ``w`` scaled by the gain, cast to ``w``'s dtype first."""
        gain = self.w_g.astype(w.dtype)
        return w * gain
73
+
74
+
75
class ActGain(nn.Module):
    """Single learned scalar that multiplies an activation tensor."""

    def __init__(self, init_value: float):
        super().__init__()
        # Stored as a one-element float32 array holding ``init_value``.
        self.a_g = mx.array([init_value], dtype=mx.float32)

    def __call__(self, x: mx.array) -> mx.array:
        """Return ``x`` scaled by the gain, cast to ``x``'s dtype first."""
        gain = self.a_g.astype(x.dtype)
        return x * gain
82
+
83
+
84
class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with RoPE, q/k RMS norm and a per-head query gain."""

    def __init__(self, args: ModelArgs):
        super().__init__()
        self.n_head = args.num_attention_heads
        self.head_dim = args.head_dim
        self.rope_theta = args.rope_theta
        self.rms_norm_eps = args.rms_norm_eps
        # Softmax scale of 1 / sqrt(head_dim).
        self.scale = self.head_dim**-0.5

        # All four projections are bias-free and square (hidden -> hidden).
        width = args.hidden_size
        self.attn_query = nn.Linear(width, width, bias=False)
        self.attn_key = nn.Linear(width, width, bias=False)
        self.attn_value = nn.Linear(width, width, bias=False)
        self.attn_resid = nn.Linear(width, width, bias=False)
        self.head_gain = HeadGain(self.n_head)

    def __call__(
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
        cache: Optional[Any] = None,
    ) -> mx.array:
        """Attend over ``x`` and return the output projection.

        Args:
            x: Input of shape [B, T, hidden].
            mask: Attention mask forwarded to ``scaled_dot_product_attention``.
            cache: Optional per-layer cache. Its ``offset`` attribute is used as
                the RoPE position offset, and ``update_and_fetch`` supplies the
                keys and values that are attended over.

        Returns:
            Array of shape [B, T, hidden].
        """
        bsz, seq_len, _ = x.shape
        per_head_shape = (bsz, seq_len, self.n_head, self.head_dim)

        def split_heads(projected: mx.array) -> mx.array:
            # [B, T, hidden] -> [B, T, H, D] -> [B, H, T, D]
            return projected.reshape(*per_head_shape).transpose(0, 2, 1, 3)

        q = split_heads(self.attn_query(x))
        k = split_heads(self.attn_key(x))
        v = split_heads(self.attn_value(x))

        # Without a cache, positions start at zero.
        position = 0 if cache is None else cache.offset
        q = apply_talkie_rope(q, offset=position, base=self.rope_theta)
        k = apply_talkie_rope(k, offset=position, base=self.rope_theta)

        # Queries and keys are normalised after the rotation; only the queries
        # receive the learned per-head gain.
        q = self.head_gain(rms_norm(q, self.rms_norm_eps))
        k = rms_norm(k, self.rms_norm_eps)

        if cache is not None:
            k, v = cache.update_and_fetch(k, v)

        attended = scaled_dot_product_attention(
            q, k, v, cache=cache, scale=self.scale, mask=mask
        )
        # [B, H, T, D] -> [B, T, H * D]
        merged = attended.transpose(0, 2, 1, 3).reshape(bsz, seq_len, -1)
        return self.attn_resid(merged)
132
+
133
+
134
class MLP(nn.Module):
    """Bias-free gated feed-forward layer."""

    def __init__(self, args: ModelArgs):
        super().__init__()
        width = args.hidden_size
        inner = args.intermediate_size
        self.mlp_gate = nn.Linear(width, inner, bias=False)
        self.mlp_linear = nn.Linear(width, inner, bias=False)
        self.mlp_resid = nn.Linear(inner, width, bias=False)

    def __call__(self, x: mx.array) -> mx.array:
        """Return ``mlp_resid((g * sigmoid(g)) * mlp_linear(x))`` with ``g = mlp_gate(x)``."""
        gate = self.mlp_gate(x)
        activated = gate * mx.sigmoid(gate)
        hidden = activated * self.mlp_linear(x)
        return self.mlp_resid(hidden)
147
+
148
+
149
class Block(nn.Module):
    """One layer: gated attention and MLP residuals plus an embedding skip."""

    def __init__(self, args: ModelArgs):
        super().__init__()
        # Both residual branches start with a gain of 1 / sqrt(2 * num_layers).
        branch_gain = (2 * args.num_hidden_layers) ** -0.5
        self.attn = CausalSelfAttention(args)
        self.attn_gain = ActGain(branch_gain)
        self.mlp = MLP(args)
        self.mlp_gain = ActGain(branch_gain)
        # The embedding skip starts at a gain of zero.
        self.embed_skip = ActGain(0.0)
        self.rms_norm_eps = args.rms_norm_eps

    def __call__(
        self,
        e_x: mx.array,
        x: mx.array,
        mask: Optional[mx.array] = None,
        cache: Optional[Any] = None,
    ) -> mx.array:
        """Run the block.

        Args:
            e_x: Embedding-stream tensor added back through ``embed_skip``.
            x: Residual stream entering this block.
            mask: Attention mask passed on to the attention layer.
            cache: Optional cache passed on to the attention layer.

        Returns:
            The updated residual stream.
        """
        eps = self.rms_norm_eps

        attn_out = self.attn(rms_norm(x, eps), mask, cache)
        x = x + self.attn_gain(attn_out)

        mlp_out = self.mlp(rms_norm(x, eps))
        x = x + self.mlp_gain(mlp_out)

        return x + self.embed_skip(e_x)
171
+
172
+
173
class Model(nn.Module):
    """Talkie language model: embedding, a stack of ``Block``s and a scaled output head."""

    def __init__(self, args: ModelArgs):
        super().__init__()
        self.args = args
        self.model_type = args.model_type
        self.embed = nn.Embedding(args.vocab_size, args.hidden_size)
        self.blocks = [Block(args) for _ in range(args.num_hidden_layers)]
        # The output matrix is a raw (vocab, hidden) array rather than an
        # nn.Linear. It is created as zeros here.
        # NOTE(review): presumably replaced by the checkpoint's "lm_head"
        # tensor at load time - confirm against the loader.
        self.lm_head = mx.zeros((args.vocab_size, args.hidden_size), dtype=mx.float32)
        self.lm_head_gain = WeightGain()

    def __call__(
        self,
        input_ids: mx.array,
        cache: Optional[Any] = None,
        input_embeddings: Optional[mx.array] = None,
    ) -> mx.array:
        """Compute output logits.

        Args:
            input_ids: Token ids; looked up in ``embed`` unless
                ``input_embeddings`` is given.
            cache: Optional sequence of per-block caches, zipped with
                ``self.blocks``. ``None`` runs every block without a cache.
            input_embeddings: Optional precomputed embeddings that take the
                place of the ``embed`` lookup.

        Returns:
            The final hidden states multiplied by the transposed, gain-scaled
            ``lm_head`` (last axis of size ``vocab_size``).
        """
        x = self.embed(input_ids) if input_embeddings is None else input_embeddings

        eps = self.args.rms_norm_eps
        x = rms_norm(x, eps)
        # The normalised embeddings are also handed to every block as ``e_x``.
        e_x = x

        if cache is None:
            cache = [None] * len(self.blocks)
        # One mask, built from the first block's cache, is shared by all blocks.
        mask = create_attention_mask(x, cache[0])

        for block, block_cache in zip(self.blocks, cache):
            x = block(e_x, x, mask=mask, cache=block_cache)

        x = rms_norm(x, eps)
        output_weight = self.lm_head_gain(self.lm_head)
        return x @ output_weight.T

    @property
    def layers(self):
        """Expose ``self.blocks`` under the name ``layers``."""
        return self.blocks
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "chat_template": "{%- set prelude = 'The following conversation took place between the HUMAN, and TALKIE - a mechanical mind imbued with the knowledge of the world and the ability to use human language - a \"thinking machine\". It is published here for the benefit of the public:' -%}{{- prelude -}}{%- for message in messages -%}{%- if message['role'] == 'user' -%}{{- '\\n\\nHUMAN:\\n\\n' + (message['content'] | trim) -}}{%- elif message['role'] == 'assistant' -%}{{- '\\n\\nTALKIE:\\n\\n' + (message['content'] | trim) -}}{%- elif message['role'] == 'system' -%}{{- '\\n\\n' + (message['content'] | trim) -}}{%- else -%}{{- raise_exception('Unsupported role: ' + message['role']) -}}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt and (messages | length == 0 or messages[-1]['role'] != 'assistant') -%}{{- '\\n\\nTALKIE:\\n\\n' -}}{%- endif -%}",
4
+ "clean_up_tokenization_spaces": false,
5
+ "eos_token": "<|endoftext|>",
6
+ "is_local": true,
7
+ "model_max_length": 2048,
8
+ "pad_token": "<|endoftext|>",
9
+ "tokenizer_class": "TokenizersBackend"
10
+ }