Upload folder using huggingface_hub

Browse files

Files changed (13) hide show

README.md +61 -0
__pycache__/talkie_mlx.cpython-311.pyc +0 -0
config.json +22 -0
generation_config.json +15 -0
model-00001-of-00005.safetensors +3 -0
model-00002-of-00005.safetensors +3 -0
model-00003-of-00005.safetensors +3 -0
model-00004-of-00005.safetensors +3 -0
model-00005-of-00005.safetensors +3 -0
model.safetensors.index.json +451 -0
talkie_mlx.py +209 -0
tokenizer.json +0 -0
tokenizer_config.json +10 -0

README.md ADDED Viewed

	@@ -0,0 +1,61 @@

+---
+language: en
+tags:
+- mlx
+library_name: mlx
+pipeline_tag: text-generation
+---
+## Chat template
+This model accepts standard `user` / `assistant` chat messages, but its chat
+template renders them into TALKIE's play-transcript format:
+```python
+messages = [
+    {"role": "user", "content": "How are you?"},
+]
+prompt = tokenizer.apply_chat_template(
+    messages,
+    tokenize=False,
+    add_generation_prompt=True,
+)
+```
+This produces:
+```text
+The following conversation took place between the HUMAN, and TALKIE - a mechanical mind imbued with the knowledge of the world and the ability to use human language - a "thinking machine". It is published here for the benefit of the public:
+HUMAN:
+How are you?
+TALKIE:
+```
+## Generation stops
+Stop generation as soon as the model starts writing a new speaker label
+(`HUMAN:` or `TALKIE:`), which marks the end of TALKIE's reply. For direct MLX
+generation, this model treats both `<|endoftext|>` (`65535`) and the `HUM`
+token (`56180`) as EOS. That makes bare `mlx_lm.generate` and
+`mlx_lm.batch_generate` stop when the model begins to write the next `HUMAN:`
+turn.
+The model package also includes `generation_config.json` with these stop
+strings for runtimes that support text stops:
+```json
+[
+  "\n\nHUMAN:",
+  "\nHUMAN:",
+  "HUMAN:",
+  "\n\nTALKIE:",
+  "\nTALKIE:",
+  "TALKIE:"
+]
+```
+When serving through `mlx_lm.server`, pass the same strings as the request
+`stop` field.

__pycache__/talkie_mlx.cpython-311.pyc ADDED Viewed

Binary file (14.7 kB). View file

config.json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+    "architectures": [
+        "TalkieForCausalLM"
+    ],
+    "eos_token_id": [
+        65535,
+        56180
+    ],
+    "head_dim": 128,
+    "hidden_size": 5120,
+    "intermediate_size": 13696,
+    "max_position_embeddings": 2048,
+    "model_file": "talkie_mlx.py",
+    "num_attention_heads": 40,
+    "num_hidden_layers": 40,
+    "pad_token_id": 65535,
+    "rms_norm_eps": 1.1920928955078125e-07,
+    "rope_theta": 1000000.0,
+    "tie_word_embeddings": false,
+    "torch_dtype": "bfloat16",
+    "vocab_size": 65536
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+  "eos_token_id": [
+    65535,
+    56180
+  ],
+  "pad_token_id": 65535,
+  "stop_strings": [
+    "\n\nHUMAN:",
+    "\nHUMAN:",
+    "HUMAN:",
+    "\n\nTALKIE:",
+    "\nTALKIE:",
+    "TALKIE:"
+  ]
+}

model-00001-of-00005.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:82943a90614904e401d2f26e9374cda9ba931a15918b4737ffc3418c23e25bbb
+size 5294007495

model-00002-of-00005.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:45d8a18b84f2a439e5c87f02719939758c0a0d7acd116fabe1daac435900cdae
+size 5324154773

model-00003-of-00005.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e52c1df6bcf1604ea6bec5888529a93ad8daaf215ab03e444d37747887d68df7
+size 5341194581

model-00004-of-00005.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a592baa0917f0c9540b07dd3eba97fe3efec06727dcd93ca47f49d03e4886e5b
+size 5236336841

model-00005-of-00005.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5d70ec8a5fe5246bf2256b7f0366e77298367744207762335dd48f78ab528cfa
+size 5364786256

model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,451 @@

+{
+    "metadata": {
+        "total_size": 26560433522,
+        "total_parameters": 12944672441
+    },
+    "weight_map": {
+        "blocks.0.attn.attn_key.weight": "model-00001-of-00005.safetensors",
+        "blocks.0.attn.attn_query.weight": "model-00001-of-00005.safetensors",
+        "blocks.0.attn.attn_resid.weight": "model-00001-of-00005.safetensors",
+        "blocks.0.attn.attn_value.weight": "model-00001-of-00005.safetensors",
+        "blocks.0.attn.head_gain.head_g": "model-00001-of-00005.safetensors",
+        "blocks.0.attn_gain.a_g": "model-00001-of-00005.safetensors",
+        "blocks.0.embed_skip.a_g": "model-00001-of-00005.safetensors",
+        "blocks.0.mlp.mlp_gate.weight": "model-00001-of-00005.safetensors",
+        "blocks.0.mlp.mlp_linear.weight": "model-00001-of-00005.safetensors",
+        "blocks.0.mlp.mlp_resid.weight": "model-00001-of-00005.safetensors",
+        "blocks.0.mlp_gain.a_g": "model-00001-of-00005.safetensors",
+        "blocks.1.attn.attn_key.weight": "model-00001-of-00005.safetensors",
+        "blocks.1.attn.attn_query.weight": "model-00001-of-00005.safetensors",
+        "blocks.1.attn.attn_resid.weight": "model-00001-of-00005.safetensors",
+        "blocks.1.attn.attn_value.weight": "model-00001-of-00005.safetensors",
+        "blocks.1.attn.head_gain.head_g": "model-00001-of-00005.safetensors",
+        "blocks.1.attn_gain.a_g": "model-00001-of-00005.safetensors",
+        "blocks.1.embed_skip.a_g": "model-00001-of-00005.safetensors",
+        "blocks.1.mlp.mlp_gate.weight": "model-00001-of-00005.safetensors",
+        "blocks.1.mlp.mlp_linear.weight": "model-00001-of-00005.safetensors",
+        "blocks.1.mlp.mlp_resid.weight": "model-00001-of-00005.safetensors",
+        "blocks.1.mlp_gain.a_g": "model-00001-of-00005.safetensors",
+        "blocks.10.attn.attn_key.weight": "model-00002-of-00005.safetensors",
+        "blocks.10.attn.attn_query.weight": "model-00002-of-00005.safetensors",
+        "blocks.10.attn.attn_resid.weight": "model-00002-of-00005.safetensors",
+        "blocks.10.attn.attn_value.weight": "model-00002-of-00005.safetensors",
+        "blocks.10.attn.head_gain.head_g": "model-00002-of-00005.safetensors",
+        "blocks.10.attn_gain.a_g": "model-00002-of-00005.safetensors",
+        "blocks.10.embed_skip.a_g": "model-00002-of-00005.safetensors",
+        "blocks.10.mlp.mlp_gate.weight": "model-00002-of-00005.safetensors",
+        "blocks.10.mlp.mlp_linear.weight": "model-00002-of-00005.safetensors",
+        "blocks.10.mlp.mlp_resid.weight": "model-00002-of-00005.safetensors",
+        "blocks.10.mlp_gain.a_g": "model-00002-of-00005.safetensors",
+        "blocks.11.attn.attn_key.weight": "model-00002-of-00005.safetensors",
+        "blocks.11.attn.attn_query.weight": "model-00002-of-00005.safetensors",
+        "blocks.11.attn.attn_resid.weight": "model-00002-of-00005.safetensors",
+        "blocks.11.attn.attn_value.weight": "model-00002-of-00005.safetensors",
+        "blocks.11.attn.head_gain.head_g": "model-00002-of-00005.safetensors",
+        "blocks.11.attn_gain.a_g": "model-00002-of-00005.safetensors",
+        "blocks.11.embed_skip.a_g": "model-00002-of-00005.safetensors",
+        "blocks.11.mlp.mlp_gate.weight": "model-00002-of-00005.safetensors",
+        "blocks.11.mlp.mlp_linear.weight": "model-00002-of-00005.safetensors",
+        "blocks.11.mlp.mlp_resid.weight": "model-00002-of-00005.safetensors",
+        "blocks.11.mlp_gain.a_g": "model-00002-of-00005.safetensors",
+        "blocks.12.attn.attn_key.weight": "model-00002-of-00005.safetensors",
+        "blocks.12.attn.attn_query.weight": "model-00002-of-00005.safetensors",
+        "blocks.12.attn.attn_resid.weight": "model-00002-of-00005.safetensors",
+        "blocks.12.attn.attn_value.weight": "model-00002-of-00005.safetensors",
+        "blocks.12.attn.head_gain.head_g": "model-00002-of-00005.safetensors",
+        "blocks.12.attn_gain.a_g": "model-00002-of-00005.safetensors",
+        "blocks.12.embed_skip.a_g": "model-00002-of-00005.safetensors",
+        "blocks.12.mlp.mlp_gate.weight": "model-00002-of-00005.safetensors",
+        "blocks.12.mlp.mlp_linear.weight": "model-00002-of-00005.safetensors",
+        "blocks.12.mlp.mlp_resid.weight": "model-00002-of-00005.safetensors",
+        "blocks.12.mlp_gain.a_g": "model-00002-of-00005.safetensors",
+        "blocks.13.attn.attn_key.weight": "model-00002-of-00005.safetensors",
+        "blocks.13.attn.attn_query.weight": "model-00002-of-00005.safetensors",
+        "blocks.13.attn.attn_resid.weight": "model-00002-of-00005.safetensors",
+        "blocks.13.attn.attn_value.weight": "model-00002-of-00005.safetensors",
+        "blocks.13.attn.head_gain.head_g": "model-00002-of-00005.safetensors",
+        "blocks.13.attn_gain.a_g": "model-00002-of-00005.safetensors",
+        "blocks.13.embed_skip.a_g": "model-00002-of-00005.safetensors",
+        "blocks.13.mlp.mlp_gate.weight": "model-00002-of-00005.safetensors",
+        "blocks.13.mlp.mlp_linear.weight": "model-00002-of-00005.safetensors",
+        "blocks.13.mlp.mlp_resid.weight": "model-00002-of-00005.safetensors",
+        "blocks.13.mlp_gain.a_g": "model-00002-of-00005.safetensors",
+        "blocks.14.attn.attn_key.weight": "model-00002-of-00005.safetensors",
+        "blocks.14.attn.attn_query.weight": "model-00002-of-00005.safetensors",
+        "blocks.14.attn.attn_resid.weight": "model-00002-of-00005.safetensors",
+        "blocks.14.attn.attn_value.weight": "model-00002-of-00005.safetensors",
+        "blocks.14.attn.head_gain.head_g": "model-00002-of-00005.safetensors",
+        "blocks.14.attn_gain.a_g": "model-00002-of-00005.safetensors",
+        "blocks.14.embed_skip.a_g": "model-00002-of-00005.safetensors",
+        "blocks.14.mlp.mlp_gate.weight": "model-00002-of-00005.safetensors",
+        "blocks.14.mlp.mlp_linear.weight": "model-00002-of-00005.safetensors",
+        "blocks.14.mlp.mlp_resid.weight": "model-00002-of-00005.safetensors",
+        "blocks.14.mlp_gain.a_g": "model-00002-of-00005.safetensors",
+        "blocks.15.attn.attn_key.weight": "model-00002-of-00005.safetensors",
+        "blocks.15.attn.attn_query.weight": "model-00002-of-00005.safetensors",
+        "blocks.15.attn.attn_resid.weight": "model-00002-of-00005.safetensors",
+        "blocks.15.attn.attn_value.weight": "model-00002-of-00005.safetensors",
+        "blocks.15.attn.head_gain.head_g": "model-00002-of-00005.safetensors",
+        "blocks.15.attn_gain.a_g": "model-00002-of-00005.safetensors",
+        "blocks.15.embed_skip.a_g": "model-00003-of-00005.safetensors",
+        "blocks.15.mlp.mlp_gate.weight": "model-00002-of-00005.safetensors",
+        "blocks.15.mlp.mlp_linear.weight": "model-00002-of-00005.safetensors",
+        "blocks.15.mlp.mlp_resid.weight": "model-00003-of-00005.safetensors",
+        "blocks.15.mlp_gain.a_g": "model-00003-of-00005.safetensors",
+        "blocks.16.attn.attn_key.weight": "model-00003-of-00005.safetensors",
+        "blocks.16.attn.attn_query.weight": "model-00003-of-00005.safetensors",
+        "blocks.16.attn.attn_resid.weight": "model-00003-of-00005.safetensors",
+        "blocks.16.attn.attn_value.weight": "model-00003-of-00005.safetensors",
+        "blocks.16.attn.head_gain.head_g": "model-00003-of-00005.safetensors",
+        "blocks.16.attn_gain.a_g": "model-00003-of-00005.safetensors",
+        "blocks.16.embed_skip.a_g": "model-00003-of-00005.safetensors",
+        "blocks.16.mlp.mlp_gate.weight": "model-00003-of-00005.safetensors",
+        "blocks.16.mlp.mlp_linear.weight": "model-00003-of-00005.safetensors",
+        "blocks.16.mlp.mlp_resid.weight": "model-00003-of-00005.safetensors",
+        "blocks.16.mlp_gain.a_g": "model-00003-of-00005.safetensors",
+        "blocks.17.attn.attn_key.weight": "model-00003-of-00005.safetensors",
+        "blocks.17.attn.attn_query.weight": "model-00003-of-00005.safetensors",
+        "blocks.17.attn.attn_resid.weight": "model-00003-of-00005.safetensors",
+        "blocks.17.attn.attn_value.weight": "model-00003-of-00005.safetensors",
+        "blocks.17.attn.head_gain.head_g": "model-00003-of-00005.safetensors",
+        "blocks.17.attn_gain.a_g": "model-00003-of-00005.safetensors",
+        "blocks.17.embed_skip.a_g": "model-00003-of-00005.safetensors",
+        "blocks.17.mlp.mlp_gate.weight": "model-00003-of-00005.safetensors",
+        "blocks.17.mlp.mlp_linear.weight": "model-00003-of-00005.safetensors",
+        "blocks.17.mlp.mlp_resid.weight": "model-00003-of-00005.safetensors",
+        "blocks.17.mlp_gain.a_g": "model-00003-of-00005.safetensors",
+        "blocks.18.attn.attn_key.weight": "model-00003-of-00005.safetensors",
+        "blocks.18.attn.attn_query.weight": "model-00003-of-00005.safetensors",
+        "blocks.18.attn.attn_resid.weight": "model-00003-of-00005.safetensors",
+        "blocks.18.attn.attn_value.weight": "model-00003-of-00005.safetensors",
+        "blocks.18.attn.head_gain.head_g": "model-00003-of-00005.safetensors",
+        "blocks.18.attn_gain.a_g": "model-00003-of-00005.safetensors",
+        "blocks.18.embed_skip.a_g": "model-00003-of-00005.safetensors",
+        "blocks.18.mlp.mlp_gate.weight": "model-00003-of-00005.safetensors",
+        "blocks.18.mlp.mlp_linear.weight": "model-00003-of-00005.safetensors",
+        "blocks.18.mlp.mlp_resid.weight": "model-00003-of-00005.safetensors",
+        "blocks.18.mlp_gain.a_g": "model-00003-of-00005.safetensors",
+        "blocks.19.attn.attn_key.weight": "model-00003-of-00005.safetensors",
+        "blocks.19.attn.attn_query.weight": "model-00003-of-00005.safetensors",
+        "blocks.19.attn.attn_resid.weight": "model-00003-of-00005.safetensors",
+        "blocks.19.attn.attn_value.weight": "model-00003-of-00005.safetensors",
+        "blocks.19.attn.head_gain.head_g": "model-00003-of-00005.safetensors",
+        "blocks.19.attn_gain.a_g": "model-00003-of-00005.safetensors",
+        "blocks.19.embed_skip.a_g": "model-00003-of-00005.safetensors",
+        "blocks.19.mlp.mlp_gate.weight": "model-00003-of-00005.safetensors",
+        "blocks.19.mlp.mlp_linear.weight": "model-00003-of-00005.safetensors",
+        "blocks.19.mlp.mlp_resid.weight": "model-00003-of-00005.safetensors",
+        "blocks.19.mlp_gain.a_g": "model-00003-of-00005.safetensors",
+        "blocks.2.attn.attn_key.weight": "model-00001-of-00005.safetensors",
+        "blocks.2.attn.attn_query.weight": "model-00001-of-00005.safetensors",
+        "blocks.2.attn.attn_resid.weight": "model-00001-of-00005.safetensors",
+        "blocks.2.attn.attn_value.weight": "model-00001-of-00005.safetensors",
+        "blocks.2.attn.head_gain.head_g": "model-00001-of-00005.safetensors",
+        "blocks.2.attn_gain.a_g": "model-00001-of-00005.safetensors",
+        "blocks.2.embed_skip.a_g": "model-00001-of-00005.safetensors",
+        "blocks.2.mlp.mlp_gate.weight": "model-00001-of-00005.safetensors",
+        "blocks.2.mlp.mlp_linear.weight": "model-00001-of-00005.safetensors",
+        "blocks.2.mlp.mlp_resid.weight": "model-00001-of-00005.safetensors",
+        "blocks.2.mlp_gain.a_g": "model-00001-of-00005.safetensors",
+        "blocks.20.attn.attn_key.weight": "model-00003-of-00005.safetensors",
+        "blocks.20.attn.attn_query.weight": "model-00003-of-00005.safetensors",
+        "blocks.20.attn.attn_resid.weight": "model-00003-of-00005.safetensors",
+        "blocks.20.attn.attn_value.weight": "model-00003-of-00005.safetensors",
+        "blocks.20.attn.head_gain.head_g": "model-00003-of-00005.safetensors",
+        "blocks.20.attn_gain.a_g": "model-00003-of-00005.safetensors",
+        "blocks.20.embed_skip.a_g": "model-00003-of-00005.safetensors",
+        "blocks.20.mlp.mlp_gate.weight": "model-00003-of-00005.safetensors",
+        "blocks.20.mlp.mlp_linear.weight": "model-00003-of-00005.safetensors",
+        "blocks.20.mlp.mlp_resid.weight": "model-00003-of-00005.safetensors",
+        "blocks.20.mlp_gain.a_g": "model-00003-of-00005.safetensors",
+        "blocks.21.attn.attn_key.weight": "model-00003-of-00005.safetensors",
+        "blocks.21.attn.attn_query.weight": "model-00003-of-00005.safetensors",
+        "blocks.21.attn.attn_resid.weight": "model-00003-of-00005.safetensors",
+        "blocks.21.attn.attn_value.weight": "model-00003-of-00005.safetensors",
+        "blocks.21.attn.head_gain.head_g": "model-00003-of-00005.safetensors",
+        "blocks.21.attn_gain.a_g": "model-00003-of-00005.safetensors",
+        "blocks.21.embed_skip.a_g": "model-00003-of-00005.safetensors",
+        "blocks.21.mlp.mlp_gate.weight": "model-00003-of-00005.safetensors",
+        "blocks.21.mlp.mlp_linear.weight": "model-00003-of-00005.safetensors",
+        "blocks.21.mlp.mlp_resid.weight": "model-00003-of-00005.safetensors",
+        "blocks.21.mlp_gain.a_g": "model-00003-of-00005.safetensors",
+        "blocks.22.attn.attn_key.weight": "model-00003-of-00005.safetensors",
+        "blocks.22.attn.attn_query.weight": "model-00003-of-00005.safetensors",
+        "blocks.22.attn.attn_resid.weight": "model-00003-of-00005.safetensors",
+        "blocks.22.attn.attn_value.weight": "model-00003-of-00005.safetensors",
+        "blocks.22.attn.head_gain.head_g": "model-00003-of-00005.safetensors",
+        "blocks.22.attn_gain.a_g": "model-00003-of-00005.safetensors",
+        "blocks.22.embed_skip.a_g": "model-00003-of-00005.safetensors",
+        "blocks.22.mlp.mlp_gate.weight": "model-00003-of-00005.safetensors",
+        "blocks.22.mlp.mlp_linear.weight": "model-00003-of-00005.safetensors",
+        "blocks.22.mlp.mlp_resid.weight": "model-00003-of-00005.safetensors",
+        "blocks.22.mlp_gain.a_g": "model-00003-of-00005.safetensors",
+        "blocks.23.attn.attn_key.weight": "model-00003-of-00005.safetensors",
+        "blocks.23.attn.attn_query.weight": "model-00003-of-00005.safetensors",
+        "blocks.23.attn.attn_resid.weight": "model-00003-of-00005.safetensors",
+        "blocks.23.attn.attn_value.weight": "model-00003-of-00005.safetensors",
+        "blocks.23.attn.head_gain.head_g": "model-00003-of-00005.safetensors",
+        "blocks.23.attn_gain.a_g": "model-00003-of-00005.safetensors",
+        "blocks.23.embed_skip.a_g": "model-00003-of-00005.safetensors",
+        "blocks.23.mlp.mlp_gate.weight": "model-00003-of-00005.safetensors",
+        "blocks.23.mlp.mlp_linear.weight": "model-00003-of-00005.safetensors",
+        "blocks.23.mlp.mlp_resid.weight": "model-00003-of-00005.safetensors",
+        "blocks.23.mlp_gain.a_g": "model-00003-of-00005.safetensors",
+        "blocks.24.attn.attn_key.weight": "model-00003-of-00005.safetensors",
+        "blocks.24.attn.attn_query.weight": "model-00003-of-00005.safetensors",
+        "blocks.24.attn.attn_resid.weight": "model-00004-of-00005.safetensors",
+        "blocks.24.attn.attn_value.weight": "model-00003-of-00005.safetensors",
+        "blocks.24.attn.head_gain.head_g": "model-00004-of-00005.safetensors",
+        "blocks.24.attn_gain.a_g": "model-00004-of-00005.safetensors",
+        "blocks.24.embed_skip.a_g": "model-00004-of-00005.safetensors",
+        "blocks.24.mlp.mlp_gate.weight": "model-00004-of-00005.safetensors",
+        "blocks.24.mlp.mlp_linear.weight": "model-00004-of-00005.safetensors",
+        "blocks.24.mlp.mlp_resid.weight": "model-00004-of-00005.safetensors",
+        "blocks.24.mlp_gain.a_g": "model-00004-of-00005.safetensors",
+        "blocks.25.attn.attn_key.weight": "model-00004-of-00005.safetensors",
+        "blocks.25.attn.attn_query.weight": "model-00004-of-00005.safetensors",
+        "blocks.25.attn.attn_resid.weight": "model-00004-of-00005.safetensors",
+        "blocks.25.attn.attn_value.weight": "model-00004-of-00005.safetensors",
+        "blocks.25.attn.head_gain.head_g": "model-00004-of-00005.safetensors",
+        "blocks.25.attn_gain.a_g": "model-00004-of-00005.safetensors",
+        "blocks.25.embed_skip.a_g": "model-00004-of-00005.safetensors",
+        "blocks.25.mlp.mlp_gate.weight": "model-00004-of-00005.safetensors",
+        "blocks.25.mlp.mlp_linear.weight": "model-00004-of-00005.safetensors",
+        "blocks.25.mlp.mlp_resid.weight": "model-00004-of-00005.safetensors",
+        "blocks.25.mlp_gain.a_g": "model-00004-of-00005.safetensors",
+        "blocks.26.attn.attn_key.weight": "model-00004-of-00005.safetensors",
+        "blocks.26.attn.attn_query.weight": "model-00004-of-00005.safetensors",
+        "blocks.26.attn.attn_resid.weight": "model-00004-of-00005.safetensors",
+        "blocks.26.attn.attn_value.weight": "model-00004-of-00005.safetensors",
+        "blocks.26.attn.head_gain.head_g": "model-00004-of-00005.safetensors",
+        "blocks.26.attn_gain.a_g": "model-00004-of-00005.safetensors",
+        "blocks.26.embed_skip.a_g": "model-00004-of-00005.safetensors",
+        "blocks.26.mlp.mlp_gate.weight": "model-00004-of-00005.safetensors",
+        "blocks.26.mlp.mlp_linear.weight": "model-00004-of-00005.safetensors",
+        "blocks.26.mlp.mlp_resid.weight": "model-00004-of-00005.safetensors",
+        "blocks.26.mlp_gain.a_g": "model-00004-of-00005.safetensors",
+        "blocks.27.attn.attn_key.weight": "model-00004-of-00005.safetensors",
+        "blocks.27.attn.attn_query.weight": "model-00004-of-00005.safetensors",
+        "blocks.27.attn.attn_resid.weight": "model-00004-of-00005.safetensors",
+        "blocks.27.attn.attn_value.weight": "model-00004-of-00005.safetensors",
+        "blocks.27.attn.head_gain.head_g": "model-00004-of-00005.safetensors",
+        "blocks.27.attn_gain.a_g": "model-00004-of-00005.safetensors",
+        "blocks.27.embed_skip.a_g": "model-00004-of-00005.safetensors",
+        "blocks.27.mlp.mlp_gate.weight": "model-00004-of-00005.safetensors",
+        "blocks.27.mlp.mlp_linear.weight": "model-00004-of-00005.safetensors",
+        "blocks.27.mlp.mlp_resid.weight": "model-00004-of-00005.safetensors",
+        "blocks.27.mlp_gain.a_g": "model-00004-of-00005.safetensors",
+        "blocks.28.attn.attn_key.weight": "model-00004-of-00005.safetensors",
+        "blocks.28.attn.attn_query.weight": "model-00004-of-00005.safetensors",
+        "blocks.28.attn.attn_resid.weight": "model-00004-of-00005.safetensors",
+        "blocks.28.attn.attn_value.weight": "model-00004-of-00005.safetensors",
+        "blocks.28.attn.head_gain.head_g": "model-00004-of-00005.safetensors",
+        "blocks.28.attn_gain.a_g": "model-00004-of-00005.safetensors",
+        "blocks.28.embed_skip.a_g": "model-00004-of-00005.safetensors",
+        "blocks.28.mlp.mlp_gate.weight": "model-00004-of-00005.safetensors",
+        "blocks.28.mlp.mlp_linear.weight": "model-00004-of-00005.safetensors",
+        "blocks.28.mlp.mlp_resid.weight": "model-00004-of-00005.safetensors",
+        "blocks.28.mlp_gain.a_g": "model-00004-of-00005.safetensors",
+        "blocks.29.attn.attn_key.weight": "model-00004-of-00005.safetensors",
+        "blocks.29.attn.attn_query.weight": "model-00004-of-00005.safetensors",
+        "blocks.29.attn.attn_resid.weight": "model-00004-of-00005.safetensors",
+        "blocks.29.attn.attn_value.weight": "model-00004-of-00005.safetensors",
+        "blocks.29.attn.head_gain.head_g": "model-00004-of-00005.safetensors",
+        "blocks.29.attn_gain.a_g": "model-00004-of-00005.safetensors",
+        "blocks.29.embed_skip.a_g": "model-00004-of-00005.safetensors",
+        "blocks.29.mlp.mlp_gate.weight": "model-00004-of-00005.safetensors",
+        "blocks.29.mlp.mlp_linear.weight": "model-00004-of-00005.safetensors",
+        "blocks.29.mlp.mlp_resid.weight": "model-00004-of-00005.safetensors",
+        "blocks.29.mlp_gain.a_g": "model-00004-of-00005.safetensors",
+        "blocks.3.attn.attn_key.weight": "model-00001-of-00005.safetensors",
+        "blocks.3.attn.attn_query.weight": "model-00001-of-00005.safetensors",
+        "blocks.3.attn.attn_resid.weight": "model-00001-of-00005.safetensors",
+        "blocks.3.attn.attn_value.weight": "model-00001-of-00005.safetensors",
+        "blocks.3.attn.head_gain.head_g": "model-00001-of-00005.safetensors",
+        "blocks.3.attn_gain.a_g": "model-00001-of-00005.safetensors",
+        "blocks.3.embed_skip.a_g": "model-00001-of-00005.safetensors",
+        "blocks.3.mlp.mlp_gate.weight": "model-00001-of-00005.safetensors",
+        "blocks.3.mlp.mlp_linear.weight": "model-00001-of-00005.safetensors",
+        "blocks.3.mlp.mlp_resid.weight": "model-00001-of-00005.safetensors",
+        "blocks.3.mlp_gain.a_g": "model-00001-of-00005.safetensors",
+        "blocks.30.attn.attn_key.weight": "model-00004-of-00005.safetensors",
+        "blocks.30.attn.attn_query.weight": "model-00004-of-00005.safetensors",
+        "blocks.30.attn.attn_resid.weight": "model-00004-of-00005.safetensors",
+        "blocks.30.attn.attn_value.weight": "model-00004-of-00005.safetensors",
+        "blocks.30.attn.head_gain.head_g": "model-00004-of-00005.safetensors",
+        "blocks.30.attn_gain.a_g": "model-00004-of-00005.safetensors",
+        "blocks.30.embed_skip.a_g": "model-00004-of-00005.safetensors",
+        "blocks.30.mlp.mlp_gate.weight": "model-00004-of-00005.safetensors",
+        "blocks.30.mlp.mlp_linear.weight": "model-00004-of-00005.safetensors",
+        "blocks.30.mlp.mlp_resid.weight": "model-00004-of-00005.safetensors",
+        "blocks.30.mlp_gain.a_g": "model-00004-of-00005.safetensors",
+        "blocks.31.attn.attn_key.weight": "model-00004-of-00005.safetensors",
+        "blocks.31.attn.attn_query.weight": "model-00004-of-00005.safetensors",
+        "blocks.31.attn.attn_resid.weight": "model-00004-of-00005.safetensors",
+        "blocks.31.attn.attn_value.weight": "model-00004-of-00005.safetensors",
+        "blocks.31.attn.head_gain.head_g": "model-00004-of-00005.safetensors",
+        "blocks.31.attn_gain.a_g": "model-00004-of-00005.safetensors",
+        "blocks.31.embed_skip.a_g": "model-00004-of-00005.safetensors",
+        "blocks.31.mlp.mlp_gate.weight": "model-00004-of-00005.safetensors",
+        "blocks.31.mlp.mlp_linear.weight": "model-00004-of-00005.safetensors",
+        "blocks.31.mlp.mlp_resid.weight": "model-00004-of-00005.safetensors",
+        "blocks.31.mlp_gain.a_g": "model-00004-of-00005.safetensors",
+        "blocks.32.attn.attn_key.weight": "model-00004-of-00005.safetensors",
+        "blocks.32.attn.attn_query.weight": "model-00004-of-00005.safetensors",
+        "blocks.32.attn.attn_resid.weight": "model-00004-of-00005.safetensors",
+        "blocks.32.attn.attn_value.weight": "model-00004-of-00005.safetensors",
+        "blocks.32.attn.head_gain.head_g": "model-00004-of-00005.safetensors",
+        "blocks.32.attn_gain.a_g": "model-00004-of-00005.safetensors",
+        "blocks.32.embed_skip.a_g": "model-00005-of-00005.safetensors",
+        "blocks.32.mlp.mlp_gate.weight": "model-00004-of-00005.safetensors",
+        "blocks.32.mlp.mlp_linear.weight": "model-00005-of-00005.safetensors",
+        "blocks.32.mlp.mlp_resid.weight": "model-00005-of-00005.safetensors",
+        "blocks.32.mlp_gain.a_g": "model-00005-of-00005.safetensors",
+        "blocks.33.attn.attn_key.weight": "model-00005-of-00005.safetensors",
+        "blocks.33.attn.attn_query.weight": "model-00005-of-00005.safetensors",
+        "blocks.33.attn.attn_resid.weight": "model-00005-of-00005.safetensors",
+        "blocks.33.attn.attn_value.weight": "model-00005-of-00005.safetensors",
+        "blocks.33.attn.head_gain.head_g": "model-00005-of-00005.safetensors",
+        "blocks.33.attn_gain.a_g": "model-00005-of-00005.safetensors",
+        "blocks.33.embed_skip.a_g": "model-00005-of-00005.safetensors",
+        "blocks.33.mlp.mlp_gate.weight": "model-00005-of-00005.safetensors",
+        "blocks.33.mlp.mlp_linear.weight": "model-00005-of-00005.safetensors",
+        "blocks.33.mlp.mlp_resid.weight": "model-00005-of-00005.safetensors",
+        "blocks.33.mlp_gain.a_g": "model-00005-of-00005.safetensors",
+        "blocks.34.attn.attn_key.weight": "model-00005-of-00005.safetensors",
+        "blocks.34.attn.attn_query.weight": "model-00005-of-00005.safetensors",
+        "blocks.34.attn.attn_resid.weight": "model-00005-of-00005.safetensors",
+        "blocks.34.attn.attn_value.weight": "model-00005-of-00005.safetensors",
+        "blocks.34.attn.head_gain.head_g": "model-00005-of-00005.safetensors",
+        "blocks.34.attn_gain.a_g": "model-00005-of-00005.safetensors",
+        "blocks.34.embed_skip.a_g": "model-00005-of-00005.safetensors",
+        "blocks.34.mlp.mlp_gate.weight": "model-00005-of-00005.safetensors",
+        "blocks.34.mlp.mlp_linear.weight": "model-00005-of-00005.safetensors",
+        "blocks.34.mlp.mlp_resid.weight": "model-00005-of-00005.safetensors",
+        "blocks.34.mlp_gain.a_g": "model-00005-of-00005.safetensors",
+        "blocks.35.attn.attn_key.weight": "model-00005-of-00005.safetensors",
+        "blocks.35.attn.attn_query.weight": "model-00005-of-00005.safetensors",
+        "blocks.35.attn.attn_resid.weight": "model-00005-of-00005.safetensors",
+        "blocks.35.attn.attn_value.weight": "model-00005-of-00005.safetensors",
+        "blocks.35.attn.head_gain.head_g": "model-00005-of-00005.safetensors",
+        "blocks.35.attn_gain.a_g": "model-00005-of-00005.safetensors",
+        "blocks.35.embed_skip.a_g": "model-00005-of-00005.safetensors",
+        "blocks.35.mlp.mlp_gate.weight": "model-00005-of-00005.safetensors",
+        "blocks.35.mlp.mlp_linear.weight": "model-00005-of-00005.safetensors",
+        "blocks.35.mlp.mlp_resid.weight": "model-00005-of-00005.safetensors",
+        "blocks.35.mlp_gain.a_g": "model-00005-of-00005.safetensors",
+        "blocks.36.attn.attn_key.weight": "model-00005-of-00005.safetensors",
+        "blocks.36.attn.attn_query.weight": "model-00005-of-00005.safetensors",
+        "blocks.36.attn.attn_resid.weight": "model-00005-of-00005.safetensors",
+        "blocks.36.attn.attn_value.weight": "model-00005-of-00005.safetensors",
+        "blocks.36.attn.head_gain.head_g": "model-00005-of-00005.safetensors",
+        "blocks.36.attn_gain.a_g": "model-00005-of-00005.safetensors",
+        "blocks.36.embed_skip.a_g": "model-00005-of-00005.safetensors",
+        "blocks.36.mlp.mlp_gate.weight": "model-00005-of-00005.safetensors",
+        "blocks.36.mlp.mlp_linear.weight": "model-00005-of-00005.safetensors",
+        "blocks.36.mlp.mlp_resid.weight": "model-00005-of-00005.safetensors",
+        "blocks.36.mlp_gain.a_g": "model-00005-of-00005.safetensors",
+        "blocks.37.attn.attn_key.weight": "model-00005-of-00005.safetensors",
+        "blocks.37.attn.attn_query.weight": "model-00005-of-00005.safetensors",
+        "blocks.37.attn.attn_resid.weight": "model-00005-of-00005.safetensors",
+        "blocks.37.attn.attn_value.weight": "model-00005-of-00005.safetensors",
+        "blocks.37.attn.head_gain.head_g": "model-00005-of-00005.safetensors",
+        "blocks.37.attn_gain.a_g": "model-00005-of-00005.safetensors",
+        "blocks.37.embed_skip.a_g": "model-00005-of-00005.safetensors",
+        "blocks.37.mlp.mlp_gate.weight": "model-00005-of-00005.safetensors",
+        "blocks.37.mlp.mlp_linear.weight": "model-00005-of-00005.safetensors",
+        "blocks.37.mlp.mlp_resid.weight": "model-00005-of-00005.safetensors",
+        "blocks.37.mlp_gain.a_g": "model-00005-of-00005.safetensors",
+        "blocks.38.attn.attn_key.weight": "model-00005-of-00005.safetensors",
+        "blocks.38.attn.attn_query.weight": "model-00005-of-00005.safetensors",
+        "blocks.38.attn.attn_resid.weight": "model-00005-of-00005.safetensors",
+        "blocks.38.attn.attn_value.weight": "model-00005-of-00005.safetensors",
+        "blocks.38.attn.head_gain.head_g": "model-00005-of-00005.safetensors",
+        "blocks.38.attn_gain.a_g": "model-00005-of-00005.safetensors",
+        "blocks.38.embed_skip.a_g": "model-00005-of-00005.safetensors",
+        "blocks.38.mlp.mlp_gate.weight": "model-00005-of-00005.safetensors",
+        "blocks.38.mlp.mlp_linear.weight": "model-00005-of-00005.safetensors",
+        "blocks.38.mlp.mlp_resid.weight": "model-00005-of-00005.safetensors",
+        "blocks.38.mlp_gain.a_g": "model-00005-of-00005.safetensors",
+        "blocks.39.attn.attn_key.weight": "model-00005-of-00005.safetensors",
+        "blocks.39.attn.attn_query.weight": "model-00005-of-00005.safetensors",
+        "blocks.39.attn.attn_resid.weight": "model-00005-of-00005.safetensors",
+        "blocks.39.attn.attn_value.weight": "model-00005-of-00005.safetensors",
+        "blocks.39.attn.head_gain.head_g": "model-00005-of-00005.safetensors",
+        "blocks.39.attn_gain.a_g": "model-00005-of-00005.safetensors",
+        "blocks.39.embed_skip.a_g": "model-00005-of-00005.safetensors",
+        "blocks.39.mlp.mlp_gate.weight": "model-00005-of-00005.safetensors",
+        "blocks.39.mlp.mlp_linear.weight": "model-00005-of-00005.safetensors",
+        "blocks.39.mlp.mlp_resid.weight": "model-00005-of-00005.safetensors",
+        "blocks.39.mlp_gain.a_g": "model-00005-of-00005.safetensors",
+        "blocks.4.attn.attn_key.weight": "model-00001-of-00005.safetensors",
+        "blocks.4.attn.attn_query.weight": "model-00001-of-00005.safetensors",
+        "blocks.4.attn.attn_resid.weight": "model-00001-of-00005.safetensors",
+        "blocks.4.attn.attn_value.weight": "model-00001-of-00005.safetensors",
+        "blocks.4.attn.head_gain.head_g": "model-00001-of-00005.safetensors",
+        "blocks.4.attn_gain.a_g": "model-00001-of-00005.safetensors",
+        "blocks.4.embed_skip.a_g": "model-00001-of-00005.safetensors",
+        "blocks.4.mlp.mlp_gate.weight": "model-00001-of-00005.safetensors",
+        "blocks.4.mlp.mlp_linear.weight": "model-00001-of-00005.safetensors",
+        "blocks.4.mlp.mlp_resid.weight": "model-00001-of-00005.safetensors",
+        "blocks.4.mlp_gain.a_g": "model-00001-of-00005.safetensors",
+        "blocks.5.attn.attn_key.weight": "model-00001-of-00005.safetensors",
+        "blocks.5.attn.attn_query.weight": "model-00001-of-00005.safetensors",
+        "blocks.5.attn.attn_resid.weight": "model-00001-of-00005.safetensors",
+        "blocks.5.attn.attn_value.weight": "model-00001-of-00005.safetensors",
+        "blocks.5.attn.head_gain.head_g": "model-00001-of-00005.safetensors",
+        "blocks.5.attn_gain.a_g": "model-00001-of-00005.safetensors",
+        "blocks.5.embed_skip.a_g": "model-00001-of-00005.safetensors",
+        "blocks.5.mlp.mlp_gate.weight": "model-00001-of-00005.safetensors",
+        "blocks.5.mlp.mlp_linear.weight": "model-00001-of-00005.safetensors",
+        "blocks.5.mlp.mlp_resid.weight": "model-00001-of-00005.safetensors",
+        "blocks.5.mlp_gain.a_g": "model-00001-of-00005.safetensors",
+        "blocks.6.attn.attn_key.weight": "model-00001-of-00005.safetensors",
+        "blocks.6.attn.attn_query.weight": "model-00001-of-00005.safetensors",
+        "blocks.6.attn.attn_resid.weight": "model-00001-of-00005.safetensors",
+        "blocks.6.attn.attn_value.weight": "model-00001-of-00005.safetensors",
+        "blocks.6.attn.head_gain.head_g": "model-00001-of-00005.safetensors",
+        "blocks.6.attn_gain.a_g": "model-00001-of-00005.safetensors",
+        "blocks.6.embed_skip.a_g": "model-00001-of-00005.safetensors",
+        "blocks.6.mlp.mlp_gate.weight": "model-00001-of-00005.safetensors",
+        "blocks.6.mlp.mlp_linear.weight": "model-00001-of-00005.safetensors",
+        "blocks.6.mlp.mlp_resid.weight": "model-00001-of-00005.safetensors",
+        "blocks.6.mlp_gain.a_g": "model-00001-of-00005.safetensors",
+        "blocks.7.attn.attn_key.weight": "model-00001-of-00005.safetensors",
+        "blocks.7.attn.attn_query.weight": "model-00001-of-00005.safetensors",
+        "blocks.7.attn.attn_resid.weight": "model-00001-of-00005.safetensors",
+        "blocks.7.attn.attn_value.weight": "model-00001-of-00005.safetensors",
+        "blocks.7.attn.head_gain.head_g": "model-00001-of-00005.safetensors",
+        "blocks.7.attn_gain.a_g": "model-00001-of-00005.safetensors",
+        "blocks.7.embed_skip.a_g": "model-00002-of-00005.safetensors",
+        "blocks.7.mlp.mlp_gate.weight": "model-00002-of-00005.safetensors",
+        "blocks.7.mlp.mlp_linear.weight": "model-00002-of-00005.safetensors",
+        "blocks.7.mlp.mlp_resid.weight": "model-00002-of-00005.safetensors",
+        "blocks.7.mlp_gain.a_g": "model-00002-of-00005.safetensors",
+        "blocks.8.attn.attn_key.weight": "model-00002-of-00005.safetensors",
+        "blocks.8.attn.attn_query.weight": "model-00002-of-00005.safetensors",
+        "blocks.8.attn.attn_resid.weight": "model-00002-of-00005.safetensors",
+        "blocks.8.attn.attn_value.weight": "model-00002-of-00005.safetensors",
+        "blocks.8.attn.head_gain.head_g": "model-00002-of-00005.safetensors",
+        "blocks.8.attn_gain.a_g": "model-00002-of-00005.safetensors",
+        "blocks.8.embed_skip.a_g": "model-00002-of-00005.safetensors",
+        "blocks.8.mlp.mlp_gate.weight": "model-00002-of-00005.safetensors",
+        "blocks.8.mlp.mlp_linear.weight": "model-00002-of-00005.safetensors",
+        "blocks.8.mlp.mlp_resid.weight": "model-00002-of-00005.safetensors",
+        "blocks.8.mlp_gain.a_g": "model-00002-of-00005.safetensors",
+        "blocks.9.attn.attn_key.weight": "model-00002-of-00005.safetensors",
+        "blocks.9.attn.attn_query.weight": "model-00002-of-00005.safetensors",
+        "blocks.9.attn.attn_resid.weight": "model-00002-of-00005.safetensors",
+        "blocks.9.attn.attn_value.weight": "model-00002-of-00005.safetensors",
+        "blocks.9.attn.head_gain.head_g": "model-00002-of-00005.safetensors",
+        "blocks.9.attn_gain.a_g": "model-00002-of-00005.safetensors",
+        "blocks.9.embed_skip.a_g": "model-00002-of-00005.safetensors",
+        "blocks.9.mlp.mlp_gate.weight": "model-00002-of-00005.safetensors",
+        "blocks.9.mlp.mlp_linear.weight": "model-00002-of-00005.safetensors",
+        "blocks.9.mlp.mlp_resid.weight": "model-00002-of-00005.safetensors",
+        "blocks.9.mlp_gain.a_g": "model-00002-of-00005.safetensors",
+        "embed.weight": "model-00001-of-00005.safetensors",
+        "lm_head": "model-00005-of-00005.safetensors",
+        "lm_head_gain.w_g": "model-00005-of-00005.safetensors"
+    }
+}

talkie_mlx.py ADDED Viewed

	@@ -0,0 +1,209 @@

+# Copyright 2026
+#
+# MLX implementation of talkie-lm/talkie-1930-13b-base.
+# This file is intentionally self-contained so an MLX model directory can load it
+# through config.json: {"model_file": "talkie_mlx.py"}.
+import math
+from dataclasses import dataclass
+from typing import Any, Optional
+import mlx.core as mx
+import mlx.nn as nn
+from mlx_lm.models.base import BaseModelArgs, create_attention_mask
+from mlx_lm.models.base import scaled_dot_product_attention
+@dataclass
+class ModelArgs(BaseModelArgs):
+    model_type: str = "talkie"
+    vocab_size: int = 65536
+    hidden_size: int = 5120
+    num_hidden_layers: int = 40
+    num_attention_heads: int = 40
+    intermediate_size: int = 13696
+    head_dim: int = 128
+    max_position_embeddings: int = 2048
+    rope_theta: float = 1_000_000.0
+    tie_word_embeddings: bool = False
+    rms_norm_eps: Optional[float] = 1.1920928955078125e-7
+def rms_norm(x: mx.array, eps: Optional[float] = None) -> mx.array:
+    if eps is None:
+        eps = mx.finfo(x.dtype).eps
+    return mx.fast.rms_norm(x, None, eps)
+def apply_talkie_rope(x: mx.array, offset: int, base: float) -> mx.array:
+    """Apply Talkie's split-half RoPE to tensors shaped [B, H, T, D]."""
+    head_dim = x.shape[-1]
+    half_dim = head_dim // 2
+    freqs = -mx.exp(
+        mx.arange(0.0, half_dim, dtype=mx.float32) * (math.log(base) / half_dim)
+    )
+    return mx.fast.rope(
+        x,
+        dims=head_dim,
+        traditional=False,
+        base=None,
+        freqs=freqs,
+        scale=1.0,
+        offset=offset,
+    )
+class HeadGain(nn.Module):
+    def __init__(self, num_heads: int):
+        super().__init__()
+        self.head_g = mx.ones((num_heads,), dtype=mx.float32)
+    def __call__(self, x: mx.array) -> mx.array:
+        return x * self.head_g.astype(x.dtype).reshape(1, -1, 1, 1)
+class WeightGain(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.w_g = mx.ones((1,), dtype=mx.float32)
+    def __call__(self, w: mx.array) -> mx.array:
+        return w * self.w_g.astype(w.dtype)
+class ActGain(nn.Module):
+    def __init__(self, init_value: float):
+        super().__init__()
+        self.a_g = mx.array([init_value], dtype=mx.float32)
+    def __call__(self, x: mx.array) -> mx.array:
+        return x * self.a_g.astype(x.dtype)
+class CausalSelfAttention(nn.Module):
+    def __init__(self, args: ModelArgs):
+        super().__init__()
+        self.n_head = args.num_attention_heads
+        self.head_dim = args.head_dim
+        self.rope_theta = args.rope_theta
+        self.rms_norm_eps = args.rms_norm_eps
+        self.scale = self.head_dim**-0.5
+        n_state = args.hidden_size
+        self.attn_query = nn.Linear(n_state, n_state, bias=False)
+        self.attn_key = nn.Linear(n_state, n_state, bias=False)
+        self.attn_value = nn.Linear(n_state, n_state, bias=False)
+        self.attn_resid = nn.Linear(n_state, n_state, bias=False)
+        self.head_gain = HeadGain(self.n_head)
+    def __call__(
+        self,
+        x: mx.array,
+        mask: Optional[mx.array] = None,
+        cache: Optional[Any] = None,
+    ) -> mx.array:
+        bsz, seq_len, _ = x.shape
+        q = self.attn_query(x).reshape(bsz, seq_len, self.n_head, self.head_dim)
+        k = self.attn_key(x).reshape(bsz, seq_len, self.n_head, self.head_dim)
+        v = self.attn_value(x).reshape(bsz, seq_len, self.n_head, self.head_dim)
+        q = q.transpose(0, 2, 1, 3)
+        k = k.transpose(0, 2, 1, 3)
+        v = v.transpose(0, 2, 1, 3)
+        offset = cache.offset if cache is not None else 0
+        q = apply_talkie_rope(q, offset=offset, base=self.rope_theta)
+        k = apply_talkie_rope(k, offset=offset, base=self.rope_theta)
+        q = rms_norm(q, self.rms_norm_eps)
+        k = rms_norm(k, self.rms_norm_eps)
+        q = self.head_gain(q)
+        if cache is not None:
+            k, v = cache.update_and_fetch(k, v)
+        y = scaled_dot_product_attention(
+            q, k, v, cache=cache, scale=self.scale, mask=mask
+        )
+        y = y.transpose(0, 2, 1, 3).reshape(bsz, seq_len, -1)
+        return self.attn_resid(y)
+class MLP(nn.Module):
+    def __init__(self, args: ModelArgs):
+        super().__init__()
+        n_state = args.hidden_size
+        n_mlp = args.intermediate_size
+        self.mlp_gate = nn.Linear(n_state, n_mlp, bias=False)
+        self.mlp_linear = nn.Linear(n_state, n_mlp, bias=False)
+        self.mlp_resid = nn.Linear(n_mlp, n_state, bias=False)
+    def __call__(self, x: mx.array) -> mx.array:
+        gate = self.mlp_gate(x)
+        x = gate * mx.sigmoid(gate) * self.mlp_linear(x)
+        return self.mlp_resid(x)
+class Block(nn.Module):
+    def __init__(self, args: ModelArgs):
+        super().__init__()
+        init_gain = (2 * args.num_hidden_layers) ** -0.5
+        self.attn = CausalSelfAttention(args)
+        self.attn_gain = ActGain(init_gain)
+        self.mlp = MLP(args)
+        self.mlp_gain = ActGain(init_gain)
+        self.embed_skip = ActGain(0.0)
+        self.rms_norm_eps = args.rms_norm_eps
+    def __call__(
+        self,
+        e_x: mx.array,
+        x: mx.array,
+        mask: Optional[mx.array] = None,
+        cache: Optional[Any] = None,
+    ) -> mx.array:
+        x = x + self.attn_gain(self.attn(rms_norm(x, self.rms_norm_eps), mask, cache))
+        x = x + self.mlp_gain(self.mlp(rms_norm(x, self.rms_norm_eps)))
+        x = x + self.embed_skip(e_x)
+        return x
+class Model(nn.Module):
+    def __init__(self, args: ModelArgs):
+        super().__init__()
+        self.args = args
+        self.model_type = args.model_type
+        self.embed = nn.Embedding(args.vocab_size, args.hidden_size)
+        self.blocks = [Block(args) for _ in range(args.num_hidden_layers)]
+        self.lm_head = mx.zeros((args.vocab_size, args.hidden_size), dtype=mx.float32)
+        self.lm_head_gain = WeightGain()
+    def __call__(
+        self,
+        input_ids: mx.array,
+        cache: Optional[Any] = None,
+        input_embeddings: Optional[mx.array] = None,
+    ) -> mx.array:
+        if input_embeddings is not None:
+            x = input_embeddings
+        else:
+            x = self.embed(input_ids)
+        x = rms_norm(x, self.args.rms_norm_eps)
+        e_x = x
+        if cache is None:
+            cache = [None] * len(self.blocks)
+        mask = create_attention_mask(x, cache[0])
+        for block, c in zip(self.blocks, cache):
+            x = block(e_x, x, mask=mask, cache=c)
+        x = rms_norm(x, self.args.rms_norm_eps)
+        return x @ self.lm_head_gain(self.lm_head).T
+    @property
+    def layers(self):
+        return self.blocks

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "backend": "tokenizers",
+  "chat_template": "{%- set prelude = 'The following conversation took place between the HUMAN, and TALKIE - a mechanical mind imbued with the knowledge of the world and the ability to use human language - a \"thinking machine\". It is published here for the benefit of the public:' -%}{{- prelude -}}{%- for message in messages -%}{%- if message['role'] == 'user' -%}{{- '\\n\\nHUMAN:\\n\\n' + (message['content'] | trim) -}}{%- elif message['role'] == 'assistant' -%}{{- '\\n\\nTALKIE:\\n\\n' + (message['content'] | trim) -}}{%- elif message['role'] == 'system' -%}{{- '\\n\\n' + (message['content'] | trim) -}}{%- else -%}{{- raise_exception('Unsupported role: ' + message['role']) -}}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt and (messages | length == 0 or messages[-1]['role'] != 'assistant') -%}{{- '\\n\\nTALKIE:\\n\\n' -}}{%- endif -%}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "is_local": true,
+  "model_max_length": 2048,
+  "pad_token": "<|endoftext|>",
+  "tokenizer_class": "TokenizersBackend"
+}