sanjay920 committed on
Commit
61cdb53
1 Parent(s): b6edd96

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .gitattributes +2 -0
  2. cal_data.safetensors +3 -0
  3. config.json +26 -0
  4. example1.png +0 -0
  5. example2.png +0 -0
  6. example3.png +0 -0
  7. generation_config.json +7 -0
  8. hidden_states.safetensors +3 -0
  9. job_new.json +0 -0
  10. measurement.json +0 -0
  11. model.safetensors.index.json +442 -0
  12. mtbench-comparison.png +0 -0
  13. needle-in-a-haystack.txt +898 -0
  14. out_tensor/lm_head.safetensors +3 -0
  15. out_tensor/model.layers.0.mlp.down_proj.safetensors +3 -0
  16. out_tensor/model.layers.0.mlp.gate_proj.safetensors +3 -0
  17. out_tensor/model.layers.0.mlp.up_proj.safetensors +3 -0
  18. out_tensor/model.layers.0.self_attn.k_proj.safetensors +3 -0
  19. out_tensor/model.layers.0.self_attn.o_proj.safetensors +3 -0
  20. out_tensor/model.layers.0.self_attn.q_proj.safetensors +3 -0
  21. out_tensor/model.layers.0.self_attn.v_proj.safetensors +3 -0
  22. out_tensor/model.layers.1.mlp.down_proj.safetensors +3 -0
  23. out_tensor/model.layers.1.mlp.gate_proj.safetensors +3 -0
  24. out_tensor/model.layers.1.mlp.up_proj.safetensors +3 -0
  25. out_tensor/model.layers.1.self_attn.k_proj.safetensors +3 -0
  26. out_tensor/model.layers.1.self_attn.o_proj.safetensors +3 -0
  27. out_tensor/model.layers.1.self_attn.q_proj.safetensors +3 -0
  28. out_tensor/model.layers.1.self_attn.v_proj.safetensors +3 -0
  29. out_tensor/model.layers.10.mlp.down_proj.safetensors +3 -0
  30. out_tensor/model.layers.10.mlp.gate_proj.safetensors +3 -0
  31. out_tensor/model.layers.10.mlp.up_proj.safetensors +3 -0
  32. out_tensor/model.layers.10.self_attn.k_proj.safetensors +3 -0
  33. out_tensor/model.layers.10.self_attn.o_proj.safetensors +3 -0
  34. out_tensor/model.layers.10.self_attn.q_proj.safetensors +3 -0
  35. out_tensor/model.layers.10.self_attn.v_proj.safetensors +3 -0
  36. out_tensor/model.layers.11.mlp.down_proj.safetensors +3 -0
  37. out_tensor/model.layers.11.mlp.gate_proj.safetensors +3 -0
  38. out_tensor/model.layers.11.mlp.up_proj.safetensors +3 -0
  39. out_tensor/model.layers.11.self_attn.k_proj.safetensors +3 -0
  40. out_tensor/model.layers.11.self_attn.o_proj.safetensors +3 -0
  41. out_tensor/model.layers.11.self_attn.q_proj.safetensors +3 -0
  42. out_tensor/model.layers.11.self_attn.v_proj.safetensors +3 -0
  43. out_tensor/model.layers.12.mlp.down_proj.safetensors +3 -0
  44. out_tensor/model.layers.12.mlp.gate_proj.safetensors +3 -0
  45. out_tensor/model.layers.12.mlp.up_proj.safetensors +3 -0
  46. out_tensor/model.layers.12.self_attn.k_proj.safetensors +3 -0
  47. out_tensor/model.layers.12.self_attn.o_proj.safetensors +3 -0
  48. out_tensor/model.layers.12.self_attn.q_proj.safetensors +3 -0
  49. out_tensor/model.layers.12.self_attn.v_proj.safetensors +3 -0
  50. out_tensor/model.layers.13.mlp.down_proj.safetensors +3 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ rubra-11b-h/rubra-11b-h.png filter=lfs diff=lfs merge=lfs -text
37
+ rubra-11b-h.png filter=lfs diff=lfs merge=lfs -text
cal_data.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08be1103ff8fcef33b570f3c0f5ae4cc7f9dc5c3f264105baa55fc9b132ed1be
3
+ size 1638488
config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "sanjay920/rubra-11b-h",
3
+ "architectures": [
4
+ "MistralForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "eos_token_id": 2,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 4096,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 14336,
13
+ "max_position_embeddings": 32768,
14
+ "model_type": "mistral",
15
+ "num_attention_heads": 32,
16
+ "num_hidden_layers": 48,
17
+ "num_key_value_heads": 8,
18
+ "rms_norm_eps": 1e-05,
19
+ "rope_theta": 1000000.0,
20
+ "sliding_window": null,
21
+ "tie_word_embeddings": false,
22
+ "torch_dtype": "float16",
23
+ "transformers_version": "4.38.2",
24
+ "use_cache": false,
25
+ "vocab_size": 32000
26
+ }
example1.png ADDED
example2.png ADDED
example3.png ADDED
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "transformers_version": "4.38.2",
6
+ "use_cache": false
7
+ }
hidden_states.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:159cc384aca646411e995e4443b10ad57f1d7e6520cf9a5903c6203bd52192cc
3
+ size 1677730376
job_new.json ADDED
The diff for this file is too large to render. See raw diff
 
measurement.json ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors.index.json ADDED
@@ -0,0 +1,442 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 24952840192
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00006-of-00006.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00006.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00006.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
13
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
14
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
15
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
16
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
17
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00006.safetensors",
18
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
19
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
20
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
21
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
22
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
23
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
24
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
25
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
26
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00006.safetensors",
27
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
28
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
29
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
30
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
31
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
32
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
33
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
34
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
35
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00006.safetensors",
36
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
37
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
38
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
39
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
40
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
41
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
42
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
43
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
44
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00006.safetensors",
45
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
46
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
47
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
48
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
49
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
50
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
51
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
52
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
53
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00006.safetensors",
54
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
55
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
56
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
57
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
58
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
59
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
60
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
61
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
62
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00006.safetensors",
63
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
64
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
65
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
66
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
67
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
68
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
69
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
70
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
71
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00006.safetensors",
72
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
73
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
74
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
75
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
76
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
77
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
78
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
79
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
80
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00006.safetensors",
81
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
82
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
83
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
84
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
85
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
86
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
87
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
88
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
89
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00006.safetensors",
90
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
91
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
92
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
93
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
94
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
95
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
96
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
97
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
98
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00006.safetensors",
99
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
100
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
101
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
102
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
103
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
104
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
105
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
106
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
107
+ "model.layers.19.input_layernorm.weight": "model-00003-of-00006.safetensors",
108
+ "model.layers.19.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
109
+ "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
110
+ "model.layers.19.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
111
+ "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
112
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
113
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
114
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
115
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
116
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00006.safetensors",
117
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
118
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
119
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
120
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
121
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
122
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
123
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
124
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
125
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00006.safetensors",
126
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
127
+ "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
128
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
129
+ "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
130
+ "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
131
+ "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
132
+ "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
133
+ "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
134
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00006.safetensors",
135
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
136
+ "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
137
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
138
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
139
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
140
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
141
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
142
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
143
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00006.safetensors",
144
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
145
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
146
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
147
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
148
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
149
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
150
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
151
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
152
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00006.safetensors",
153
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
154
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
155
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
156
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
157
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
158
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
159
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
160
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
161
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00006.safetensors",
162
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
163
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
164
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
165
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
166
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
167
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
168
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
169
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
170
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00006.safetensors",
171
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
172
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
173
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
174
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
175
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
176
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
177
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
178
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
179
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00006.safetensors",
180
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
181
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
182
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
183
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
184
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
185
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
186
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
187
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
188
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00006.safetensors",
189
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
190
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
191
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
192
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
193
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
194
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
195
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
196
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
197
+ "model.layers.28.input_layernorm.weight": "model-00003-of-00006.safetensors",
198
+ "model.layers.28.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
199
+ "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
200
+ "model.layers.28.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
201
+ "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
202
+ "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
203
+ "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
204
+ "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
205
+ "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
206
+ "model.layers.29.input_layernorm.weight": "model-00004-of-00006.safetensors",
207
+ "model.layers.29.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
208
+ "model.layers.29.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
209
+ "model.layers.29.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
210
+ "model.layers.29.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
211
+ "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
212
+ "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
213
+ "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
214
+ "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
215
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00006.safetensors",
216
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
217
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
218
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
219
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
220
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
221
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
222
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
223
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
224
+ "model.layers.30.input_layernorm.weight": "model-00004-of-00006.safetensors",
225
+ "model.layers.30.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
226
+ "model.layers.30.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
227
+ "model.layers.30.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
228
+ "model.layers.30.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
229
+ "model.layers.30.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
230
+ "model.layers.30.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
231
+ "model.layers.30.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
232
+ "model.layers.30.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
233
+ "model.layers.31.input_layernorm.weight": "model-00004-of-00006.safetensors",
234
+ "model.layers.31.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
235
+ "model.layers.31.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
236
+ "model.layers.31.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
237
+ "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
238
+ "model.layers.31.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
239
+ "model.layers.31.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
240
+ "model.layers.31.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
241
+ "model.layers.31.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
242
+ "model.layers.32.input_layernorm.weight": "model-00004-of-00006.safetensors",
243
+ "model.layers.32.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
244
+ "model.layers.32.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
245
+ "model.layers.32.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
246
+ "model.layers.32.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
247
+ "model.layers.32.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
248
+ "model.layers.32.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
249
+ "model.layers.32.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
250
+ "model.layers.32.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
251
+ "model.layers.33.input_layernorm.weight": "model-00004-of-00006.safetensors",
252
+ "model.layers.33.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
253
+ "model.layers.33.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
254
+ "model.layers.33.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
255
+ "model.layers.33.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
256
+ "model.layers.33.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
257
+ "model.layers.33.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
258
+ "model.layers.33.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
259
+ "model.layers.33.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
260
+ "model.layers.34.input_layernorm.weight": "model-00004-of-00006.safetensors",
261
+ "model.layers.34.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
262
+ "model.layers.34.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
263
+ "model.layers.34.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
264
+ "model.layers.34.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
265
+ "model.layers.34.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
266
+ "model.layers.34.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
267
+ "model.layers.34.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
268
+ "model.layers.34.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
269
+ "model.layers.35.input_layernorm.weight": "model-00004-of-00006.safetensors",
270
+ "model.layers.35.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
271
+ "model.layers.35.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
272
+ "model.layers.35.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
273
+ "model.layers.35.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
274
+ "model.layers.35.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
275
+ "model.layers.35.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
276
+ "model.layers.35.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
277
+ "model.layers.35.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
278
+ "model.layers.36.input_layernorm.weight": "model-00004-of-00006.safetensors",
279
+ "model.layers.36.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
280
+ "model.layers.36.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
281
+ "model.layers.36.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
282
+ "model.layers.36.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
283
+ "model.layers.36.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
284
+ "model.layers.36.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
285
+ "model.layers.36.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
286
+ "model.layers.36.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
287
+ "model.layers.37.input_layernorm.weight": "model-00004-of-00006.safetensors",
288
+ "model.layers.37.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
289
+ "model.layers.37.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
290
+ "model.layers.37.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
291
+ "model.layers.37.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
292
+ "model.layers.37.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
293
+ "model.layers.37.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
294
+ "model.layers.37.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
295
+ "model.layers.37.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
296
+ "model.layers.38.input_layernorm.weight": "model-00005-of-00006.safetensors",
297
+ "model.layers.38.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
298
+ "model.layers.38.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
299
+ "model.layers.38.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
300
+ "model.layers.38.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
301
+ "model.layers.38.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
302
+ "model.layers.38.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
303
+ "model.layers.38.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
304
+ "model.layers.38.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
305
+ "model.layers.39.input_layernorm.weight": "model-00005-of-00006.safetensors",
306
+ "model.layers.39.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
307
+ "model.layers.39.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
308
+ "model.layers.39.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
309
+ "model.layers.39.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
310
+ "model.layers.39.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
311
+ "model.layers.39.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
312
+ "model.layers.39.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
313
+ "model.layers.39.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
314
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00006.safetensors",
315
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
316
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
317
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
318
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
319
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
320
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
321
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
322
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
323
+ "model.layers.40.input_layernorm.weight": "model-00005-of-00006.safetensors",
324
+ "model.layers.40.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
325
+ "model.layers.40.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
326
+ "model.layers.40.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
327
+ "model.layers.40.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
328
+ "model.layers.40.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
329
+ "model.layers.40.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
330
+ "model.layers.40.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
331
+ "model.layers.40.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
332
+ "model.layers.41.input_layernorm.weight": "model-00005-of-00006.safetensors",
333
+ "model.layers.41.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
334
+ "model.layers.41.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
335
+ "model.layers.41.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
336
+ "model.layers.41.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
337
+ "model.layers.41.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
338
+ "model.layers.41.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
339
+ "model.layers.41.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
340
+ "model.layers.41.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
341
+ "model.layers.42.input_layernorm.weight": "model-00005-of-00006.safetensors",
342
+ "model.layers.42.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
343
+ "model.layers.42.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
344
+ "model.layers.42.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
345
+ "model.layers.42.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
346
+ "model.layers.42.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
347
+ "model.layers.42.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
348
+ "model.layers.42.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
349
+ "model.layers.42.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
350
+ "model.layers.43.input_layernorm.weight": "model-00005-of-00006.safetensors",
351
+ "model.layers.43.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
352
+ "model.layers.43.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
353
+ "model.layers.43.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
354
+ "model.layers.43.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
355
+ "model.layers.43.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
356
+ "model.layers.43.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
357
+ "model.layers.43.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
358
+ "model.layers.43.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
359
+ "model.layers.44.input_layernorm.weight": "model-00005-of-00006.safetensors",
360
+ "model.layers.44.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
361
+ "model.layers.44.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
362
+ "model.layers.44.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
363
+ "model.layers.44.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
364
+ "model.layers.44.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
365
+ "model.layers.44.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
366
+ "model.layers.44.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
367
+ "model.layers.44.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
368
+ "model.layers.45.input_layernorm.weight": "model-00005-of-00006.safetensors",
369
+ "model.layers.45.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
370
+ "model.layers.45.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
371
+ "model.layers.45.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
372
+ "model.layers.45.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
373
+ "model.layers.45.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
374
+ "model.layers.45.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
375
+ "model.layers.45.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
376
+ "model.layers.45.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
377
+ "model.layers.46.input_layernorm.weight": "model-00005-of-00006.safetensors",
378
+ "model.layers.46.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
379
+ "model.layers.46.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
380
+ "model.layers.46.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
381
+ "model.layers.46.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
382
+ "model.layers.46.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
383
+ "model.layers.46.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
384
+ "model.layers.46.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
385
+ "model.layers.46.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
386
+ "model.layers.47.input_layernorm.weight": "model-00005-of-00006.safetensors",
387
+ "model.layers.47.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
388
+ "model.layers.47.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
389
+ "model.layers.47.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
390
+ "model.layers.47.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
391
+ "model.layers.47.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
392
+ "model.layers.47.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
393
+ "model.layers.47.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
394
+ "model.layers.47.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
395
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00006.safetensors",
396
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
397
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
398
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
399
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
400
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
401
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
402
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
403
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
404
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00006.safetensors",
405
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
406
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
407
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
408
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
409
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
410
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
411
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
412
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
413
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00006.safetensors",
414
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
415
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
416
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
417
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
418
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
419
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
420
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
421
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
422
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00006.safetensors",
423
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
424
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
425
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
426
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
427
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
428
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
429
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
430
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
431
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00006.safetensors",
432
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
433
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
434
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
435
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
436
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
437
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
438
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
439
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
440
+ "model.norm.weight": "model-00005-of-00006.safetensors"
441
+ }
442
+ }
mtbench-comparison.png ADDED
needle-in-a-haystack.txt ADDED
@@ -0,0 +1,898 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ what is the random number?
2
+ ```
3
+ May 2006 (This essay is derived from a keynote at Xtech.) Could you reproduce Silicon Valley elsewhere, or is there something
4
+ unique about it? It wouldn't be surprising if it were hard to reproduce in other
5
+ countries, because you couldn't reproduce it in most of the US
6
+ either. What does it take to make a silicon valley even here? What it takes is the right people. If you could get the right ten
7
+ thousand people to move from Silicon Valley to Buffalo, Buffalo
8
+ would become Silicon Valley.
9
+ [1] That's a striking departure from the past. Up till a couple decades
10
+ ago, geography was destiny for cities. All great cities were located
11
+ on waterways, because cities made money by trade, and water was the
12
+ only economical way to ship. Now you could make a great city anywhere, if you could get the right
13
+ people to move there. So the question of how to make a silicon
14
+ valley becomes: who are the right people, and how do you get them
15
+ to move?Two TypesI think you only need two kinds of people to create a technology
16
+ hub: rich people and nerds. They're the limiting reagents in the
17
+ reaction that produces startups, because they're the only ones
18
+ present when startups get started. Everyone else will move. Observation bears this out: within the US, towns have become startup
19
+ hubs if and only if they have both rich people and nerds. Few
20
+ startups happen in Miami, for example, because although it's full
21
+ of rich people, it has few nerds. It's not the kind of place nerds
22
+ like. Whereas Pittsburgh has the opposite problem: plenty of nerds, but
23
+ no rich people. The top US Computer Science departments are said
24
+ to be MIT, Stanford, Berkeley, and Carnegie-Mellon. MIT yielded
25
+ Route 128. Stanford and Berkeley yielded Silicon Valley. But
26
+ Carnegie-Mellon? The record skips at that point. Lower down the
27
+ list, the University of Washington yielded a high-tech community
28
+ in Seattle, and the University of Texas at Austin yielded one in
29
+ Austin. But what happened in Pittsburgh? And in Ithaca, home of
30
+ Cornell, which is also high on the list? I grew up in Pittsburgh and went to college at Cornell, so I can
31
+ answer for both. The weather is terrible, particularly in winter,
32
+ and there's no interesting old city to make up for it, as there is
33
+ in Boston. Rich people don't want to live in Pittsburgh or Ithaca.
34
+ So while there are plenty of hackers who could start startups,
35
+ there's no one to invest in them.Not BureaucratsDo you really need the rich people? Wouldn't it work to have the
36
+ government invest in the nerds? No, it would not. Startup investors
37
+ are a distinct type of rich people. They tend to have a lot of
38
+ experience themselves in the technology business. This (a) helps
39
+ them pick the right startups, and (b) means they can supply advice
40
+ and connections as well as money. And the fact that they have a
41
+ personal stake in the outcome makes them really pay attention. Bureaucrats by their nature are the exact opposite sort of people
42
+ from startup investors. The idea of them making startup investments
43
+ is comic. It would be like mathematicians running Vogue-- or
44
+ perhaps more accurately, Vogue editors running a math journal.
45
+ [2] Though indeed, most things bureaucrats do, they do badly. We just
46
+ don't notice usually, because they only have to compete against
47
+ other bureaucrats. But as startup investors they'd have to compete
48
+ against pros with a great deal more experience and motivation. Even corporations that have in-house VC groups generally forbid
49
+ them to make their own investment decisions. Most are only allowed
50
+ to invest in deals where some reputable private VC firm is willing
51
+ to act as lead investor.Not BuildingsIf you go to see Silicon Valley, what you'll see are buildings.
52
+ But it's the people that make it Silicon Valley, not the buildings.
53
+ I read occasionally about attempts to set up "technology
54
+ parks" in other places, as if the active ingredient of Silicon
55
+ Valley were the office space. An article about Sophia Antipolis
56
+ bragged that companies there included Cisco, Compaq, IBM, NCR, and
57
+ Nortel. Don't the French realize these aren't startups? Building office buildings for technology companies won't get you a
58
+ silicon valley, because the key stage in the life of a startup
59
+ happens before they want that kind of space. The key stage is when
60
+ they're three guys operating out of an apartment. Wherever the
61
+ startup is when it gets funded, it will stay. The defining quality
62
+ of Silicon Valley is not that Intel or Apple or Google have offices
63
+ there, but that they were started there. So if you want to reproduce Silicon Valley, what you need to reproduce
64
+ is those two or three founders sitting around a kitchen table
65
+ deciding to start a company. And to reproduce that you need those
66
+ people.UniversitiesThe exciting thing is, all you need are the people. If you could
67
+ attract a critical mass of nerds and investors to live somewhere,
68
+ you could reproduce Silicon Valley. And both groups are highly
69
+ mobile. They'll go where life is good. So what makes a place good
70
+ to them? What nerds like is other nerds. Smart people will go wherever other
71
+ smart people are. And in particular, to great universities. In
72
+ theory there could be other ways to attract them, but so far
73
+ universities seem to be indispensable. Within the US, there are
74
+ no technology hubs without first-rate universities-- or at least,
75
+ first-rate computer science departments. So if you want to make a silicon valley, you not only need a
76
+ university, but one of the top handful in the world. It has to be
77
+ good enough to act as a magnet, drawing the best people from thousands
78
+ of miles away. And that means it has to stand up to existing magnets
79
+ like MIT and Stanford. This sounds hard. Actually it might be easy. My professor friends,
80
+ when they're deciding where they'd like to work, consider one thing
81
+ above all: the quality of the other faculty. What attracts professors
82
+ is good colleagues. So if you managed to recruit, en masse, a
83
+ significant number of the best young researchers, you could create
84
+ a first-rate university from nothing overnight. And you could do
85
+ that for surprisingly little. If you paid 200 people hiring bonuses
86
+ of $3 million apiece, you could put together a faculty that would
87
+ bear comparison with any in the world. And from that point the
88
+ chain reaction would be self-sustaining. So whatever it costs to
89
+ establish a mediocre university, for an additional half billion or
90
+ so you could have a great one.
91
+ Random number is 23.
92
+ [3]PersonalityHowever, merely creating a new university would not be enough to
93
+ start a silicon valley. The university is just the seed. It has
94
+ to be planted in the right soil, or it won't germinate. Plant it
95
+ in the wrong place, and you just create Carnegie-Mellon. To spawn startups, your university has to be in a town that has
96
+ attractions other than the university. It has to be a place where
97
+ investors want to live, and students want to stay after they graduate. The two like much the same things, because most startup investors
98
+ are nerds themselves. So what do nerds look for in a town? Their
99
+ tastes aren't completely different from other people's, because a
100
+ lot of the towns they like most in the US are also big tourist
101
+ destinations: San Francisco, Boston, Seattle. But their tastes
102
+ can't be quite mainstream either, because they dislike other big
103
+ tourist destinations, like New York, Los Angeles, and Las Vegas. There has been a lot written lately about the "creative class." The
104
+ thesis seems to be that as wealth derives increasingly from ideas,
105
+ cities will prosper only if they attract those who have them. That
106
+ is certainly true; in fact it was the basis of Amsterdam's prosperity
107
+ 400 years ago. A lot of nerd tastes they share with the creative class in general.
108
+ For example, they like well-preserved old neighborhoods instead of
109
+ cookie-cutter suburbs, and locally-owned shops and restaurants
110
+ instead of national chains. Like the rest of the creative class,
111
+ they want to live somewhere with personality. What exactly is personality? I think it's the feeling that each
112
+ building is the work of a distinct group of people. A town with
113
+ personality is one that doesn't feel mass-produced. So if you want
114
+ to make a startup hub-- or any town to attract the "creative class"--
115
+ you probably have to ban large development projects.
116
+ When a large tract has been developed by a single organization, you
117
+ can always tell.
118
+ [4] Most towns with personality are old, but they don't have to be.
119
+ Old towns have two advantages: they're denser, because they were
120
+ laid out before cars, and they're more varied, because they were
121
+ built one building at a time. You could have both now. Just have
122
+ building codes that ensure density, and ban large scale developments. A corollary is that you have to keep out the biggest developer of
123
+ all: the government. A government that asks "How can we build a
124
+ silicon valley?" has probably ensured failure by the way they framed
125
+ the question. You don't build a silicon valley; you let one grow.NerdsIf you want to attract nerds, you need more than a town with
126
+ personality. You need a town with the right personality. Nerds
127
+ are a distinct subset of the creative class, with different tastes
128
+ from the rest. You can see this most clearly in New York, which
129
+ attracts a lot of creative people, but few nerds.
130
+ [5] What nerds like is the kind of town where people walk around smiling.
131
+ This excludes LA, where no one walks at all, and also New York,
132
+ where people walk, but not smiling. When I was in grad school in
133
+ Boston, a friend came to visit from New York. On the subway back
134
+ from the airport she asked "Why is everyone smiling?" I looked and
135
+ they weren't smiling. They just looked like they were compared to
136
+ the facial expressions she was used to. If you've lived in New York, you know where these facial expressions
137
+ come from. It's the kind of place where your mind may be excited,
138
+ but your body knows it's having a bad time. People don't so much
139
+ enjoy living there as endure it for the sake of the excitement.
140
+ And if you like certain kinds of excitement, New York is incomparable.
141
+ It's a hub of glamour, a magnet for all the shorter half-life
142
+ isotopes of style and fame. Nerds don't care about glamour, so to them the appeal of New York
143
+ is a mystery. People who like New York will pay a fortune for a
144
+ small, dark, noisy apartment in order to live in a town where the
145
+ cool people are really cool. A nerd looks at that deal and sees
146
+ only: pay a fortune for a small, dark, noisy apartment. Nerds will pay a premium to live in a town where the smart people
147
+ are really smart, but you don't have to pay as much for that. It's
148
+ supply and demand: glamour is popular, so you have to pay a lot for
149
+ it. Most nerds like quieter pleasures. They like cafes instead of
150
+ clubs; used bookshops instead of fashionable clothing shops; hiking
151
+ instead of dancing; sunlight instead of tall buildings. A nerd's
152
+ idea of paradise is Berkeley or Boulder.YouthIt's the young nerds who start startups, so it's those specifically
153
+ the city has to appeal to. The startup hubs in the US are all
154
+ young-feeling towns. This doesn't mean they have to be new.
155
+ Cambridge has the oldest town plan in America, but it feels young
156
+ because it's full of students. What you can't have, if you want to create a silicon valley, is a
157
+ large, existing population of stodgy people. It would be a waste
158
+ of time to try to reverse the fortunes of a declining industrial town
159
+ like Detroit or Philadelphia by trying to encourage startups. Those
160
+ places have too much momentum in the wrong direction. You're better
161
+ off starting with a blank slate in the form of a small town. Or
162
+ better still, if there's a town young people already flock to, that
163
+ one. The Bay Area was a magnet for the young and optimistic for decades
164
+ before it was associated with technology. It was a place people
165
+ went in search of something new. And so it became synonymous with
166
+ California nuttiness. There's still a lot of that there. If you
167
+ wanted to start a new fad-- a new way to focus one's "energy," for
168
+ example, or a new category of things not to eat-- the Bay Area would
169
+ be the place to do it. But a place that tolerates oddness in the
170
+ search for the new is exactly what you want in a startup hub, because
171
+ economically that's what startups are. Most good startup ideas
172
+ seem a little crazy; if they were obviously good ideas, someone
173
+ would have done them already. (How many people are going to want computers in their houses?
174
+ What, another search engine?) That's the connection between technology and liberalism. Without
175
+ exception the high-tech cities in the US are also the most liberal.
176
+ But it's not because liberals are smarter that this is so. It's
177
+ because liberal cities tolerate odd ideas, and smart people by
178
+ definition have odd ideas. Conversely, a town that gets praised for being "solid" or representing
179
+ "traditional values" may be a fine place to live, but it's never
180
+ going to succeed as a startup hub. The 2004 presidential election,
181
+ though a disaster in other respects, conveniently supplied us with
182
+ a county-by-county
183
+ map of such places.
184
+ [6] To attract the young, a town must have an intact center. In most
185
+ American cities the center has been abandoned, and the growth, if
186
+ any, is in the suburbs. Most American cities have been turned
187
+ inside out. But none of the startup hubs has: not San Francisco,
188
+ or Boston, or Seattle. They all have intact centers.
189
+ [7]
190
+ My guess is that no city with a dead center could be turned into a
191
+ startup hub. Young people don't want to live in the suburbs. Within the US, the two cities I think could most easily be turned
192
+ into new silicon valleys are Boulder and Portland. Both have the
193
+ kind of effervescent feel that attracts the young. They're each
194
+ only a great university short of becoming a silicon valley, if they
195
+ wanted to.TimeA great university near an attractive town. Is that all it takes?
196
+ That was all it took to make the original Silicon Valley. Silicon
197
+ Valley traces its origins to William Shockley, one of the inventors
198
+ of the transistor. He did the research that won him the Nobel Prize
199
+ at Bell Labs, but when he started his own company in 1956 he moved
200
+ to Palo Alto to do it. At the time that was an odd thing to do.
201
+ Why did he? Because he had grown up there and remembered how nice
202
+ it was. Now Palo Alto is suburbia, but then it was a charming
203
+ college town-- a charming college town with perfect weather and San
204
+ Francisco only an hour away. The companies that rule Silicon Valley now are all descended in
205
+ various ways from Shockley Semiconductor. Shockley was a difficult
206
+ man, and in 1957 his top people-- "the traitorous eight"-- left to
207
+ start a new company, Fairchild Semiconductor. Among them were
208
+ Gordon Moore and Robert Noyce, who went on to found Intel, and
209
+ Eugene Kleiner, who founded the VC firm Kleiner Perkins. Forty-two
210
+ years later, Kleiner Perkins funded Google, and the partner responsible
211
+ for the deal was John Doerr, who came to Silicon Valley in 1974 to
212
+ work for Intel. So although a lot of the newest companies in Silicon Valley don't
213
+ make anything out of silicon, there always seem to be multiple links
214
+ back to Shockley. There's a lesson here: startups beget startups.
215
+ People who work for startups start their own. People who get rich
216
+ from startups fund new ones. I suspect this kind of organic growth
217
+ is the only way to produce a startup hub, because it's the only way
218
+ to grow the expertise you need. That has two important implications. The first is that you need
219
+ time to grow a silicon valley. The university you could create in
220
+ a couple years, but the startup community around it has to grow
221
+ organically. The cycle time is limited by the time it takes a
222
+ company to succeed, which probably averages about five years. The other implication of the organic growth hypothesis is that you
223
+ can't be somewhat of a startup hub. You either have a self-sustaining
224
+ chain reaction, or not. Observation confirms this too: cities
225
+ either have a startup scene, or they don't. There is no middle
226
+ ground. Chicago has the third largest metropolitan area in America.
227
+ As a source of startups it's negligible compared to Seattle, number 15. The good news is that the initial seed can be quite small. Shockley
228
+ Semiconductor, though itself not very successful, was big enough.
229
+ It brought a critical mass of experts in an important new technology
230
+ together in a place they liked enough to stay.CompetingOf course, a would-be silicon valley faces an obstacle the original
231
+ one didn't: it has to compete with Silicon Valley. Can that be
232
+ done? Probably. One of Silicon Valley's biggest advantages is its venture capital
233
+ firms. This was not a factor in Shockley's day, because VC funds
234
+ didn't exist. In fact, Shockley Semiconductor and Fairchild
235
+ Semiconductor were not startups at all in our sense. They were
236
+ subsidiaries-- of Beckman Instruments and Fairchild Camera and
237
+ Instrument respectively. Those companies were apparently willing
238
+ to establish subsidiaries wherever the experts wanted to live. Venture investors, however, prefer to fund startups within an hour's
239
+ drive. For one, they're more likely to notice startups nearby.
240
+ But when they do notice startups in other towns they prefer them
241
+ to move. They don't want to have to travel to attend board meetings,
242
+ and in any case the odds of succeeding are higher in a startup hub. The centralizing effect of venture firms is a double one: they cause
243
+ startups to form around them, and those draw in more startups through
244
+ acquisitions. And although the first may be weakening because it's
245
+ now so cheap to start some startups, the second seems as strong as ever.
246
+ Three of the most admired
247
+ "Web 2.0" companies were started outside the usual startup hubs,
248
+ but two of them have already been reeled in through acquisitions. Such centralizing forces make it harder for new silicon valleys to
249
+ get started. But by no means impossible. Ultimately power rests
250
+ with the founders. A startup with the best people will beat one
251
+ with funding from famous VCs, and a startup that was sufficiently
252
+ successful would never have to move. So a town that
253
+ could exert enough pull over the right people could resist and
254
+ perhaps even surpass Silicon Valley. For all its power, Silicon Valley has a great weakness: the paradise
255
+ Shockley found in 1956 is now one giant parking lot. San Francisco
256
+ and Berkeley are great, but they're forty miles away. Silicon
257
+ Valley proper is soul-crushing suburban sprawl. It
258
+ has fabulous weather, which makes it significantly better than the
259
+ soul-crushing sprawl of most other American cities. But a competitor
260
+ that managed to avoid sprawl would have real leverage. All a city
261
+ needs is to be the kind of place the next traitorous eight look at
262
+ and say "I want to stay here," and that would be enough to get the
263
+ chain reaction started. Notes: [1]
264
+ It's interesting to consider how low this number could be
265
+ made. I suspect five hundred would be enough, even if they could
266
+ bring no assets with them. Probably just thirty, if I could pick them,
267
+ would be enough to turn Buffalo into a significant startup hub.[2]
268
+ Bureaucrats manage to allocate research funding moderately
269
+ well, but only because (like an in-house VC fund) they outsource
270
+ most of the work of selection. A professor at a famous university
271
+ who is highly regarded by his peers will get funding, pretty much
272
+ regardless of the proposal. That wouldn't work for startups, whose
273
+ founders aren't sponsored by organizations, and are often unknowns.[3]
274
+ You'd have to do it all at once, or at least a whole department
275
+ at a time, because people would be more likely to come if they
276
+ knew their friends were. And you should probably start from scratch,
277
+ rather than trying to upgrade an existing university, or much energy
278
+ would be lost in friction.[4]
279
+ Hypothesis: Any plan in which multiple independent buildings
280
+ are gutted or demolished to be "redeveloped" as a single project
281
+ is a net loss of personality for the city, with the exception of
282
+ the conversion of buildings not previously public, like warehouses.[5]
283
+ A few startups get started in New York, but less
284
+ than a tenth as many per capita as in Boston, and mostly
285
+ in less nerdy fields like finance and media.[6]
286
+ Some blue counties are false positives (reflecting the
287
+ remaining power of Democratic party machines), but there are no
288
+ false negatives. You can safely write off all the red counties.[7]
289
+ Some "urban renewal" experts took a shot at destroying Boston's
290
+ in the 1960s, leaving the area around city hall a bleak wasteland,
291
+ but most neighborhoods successfully resisted them.Thanks to Chris Anderson, Trevor Blackwell, Marc Hedlund,
292
+ Jessica Livingston, Robert Morris, Greg McAdoo, Fred Wilson,
293
+ and Stephen Wolfram for
294
+ reading drafts of this, and to Ed Dumbill for inviting me to speak.(The second part of this talk became Why Startups
295
+ Condense in America.)
296
+ May 2001. (This article was written as a kind of business plan for a
297
+ new language.
298
+ So it is missing (because it takes for granted) the most important
299
+ feature of a good programming language: very powerful abstractions.)A friend of mine once told an eminent operating systems
300
+ expert that he wanted to design a really good
301
+ programming language. The expert told him that it would be a
302
+ waste of time, that programming languages don't become popular
303
+ or unpopular based on their merits, and so no matter how
304
+ good his language was, no one would use it. At least, that
305
+ was what had happened to the language he had designed.What does make a language popular? Do popular
306
+ languages deserve their popularity? Is it worth trying to
307
+ define a good programming language? How would you do it?I think the answers to these questions can be found by looking
308
+ at hackers, and learning what they want. Programming
309
+ languages are for hackers, and a programming language
310
+ is good as a programming language (rather than, say, an
311
+ exercise in denotational semantics or compiler design)
312
+ if and only if hackers like it. 1. The Mechanics of Popularity. It's true, certainly, that most people don't choose programming
313
+ languages simply based on their merits. Most programmers are told
314
+ what language to use by someone else. And yet I think the effect
315
+ of such external factors on the popularity of programming languages
316
+ is not as great as it's sometimes thought to be. I think a bigger
317
+ problem is that a hacker's idea of a good programming language is
318
+ not the same as most language designers'.Between the two, the hacker's opinion is the one that matters.
319
+ Programming languages are not theorems. They're tools, designed
320
+ for people, and they have to be designed to suit human strengths
321
+ and weaknesses as much as shoes have to be designed for human feet.
322
+ If a shoe pinches when you put it on, it's a bad shoe, however
323
+ elegant it may be as a piece of sculpture.It may be that the majority of programmers can't tell a good language
324
+ from a bad one. But that's no different with any other tool. It
325
+ doesn't mean that it's a waste of time to try designing a good
326
+ language. Expert hackers
327
+ can tell a good language when they see
328
+ one, and they'll use it. Expert hackers are a tiny minority,
329
+ admittedly, but that tiny minority write all the good software,
330
+ and their influence is such that the rest of the programmers will
331
+ tend to use whatever language they use. Often, indeed, it is not
332
+ merely influence but command: often the expert hackers are the very
333
+ people who, as their bosses or faculty advisors, tell the other
334
+ programmers what language to use.The opinion of expert hackers is not the only force that determines
335
+ the relative popularity of programming languages — legacy software
336
+ (Cobol) and hype (Ada, Java) also play a role — but I think it is
337
+ the most powerful force over the long term. Given an initial critical
338
+ mass and enough time, a programming language probably becomes about
339
+ as popular as it deserves to be. And popularity further separates
340
+ good languages from bad ones, because feedback from real live users
341
+ always leads to improvements. Look at how much any popular language
342
+ has changed during its life. Perl and Fortran are extreme cases,
343
+ but even Lisp has changed a lot. Lisp 1.5 didn't have macros, for
344
+ example; these evolved later, after hackers at MIT had spent a
345
+ couple years using Lisp to write real programs. [1]So whether or not a language has to be good to be popular, I think
346
+ a language has to be popular to be good. And it has to stay popular
347
+ to stay good. The state of the art in programming languages doesn't
348
+ stand still. And yet the Lisps we have today are still pretty much
349
+ what they had at MIT in the mid-1980s, because that's the last time
350
+ Lisp had a sufficiently large and demanding user base.Of course, hackers have to know about a language before they can
351
+ use it. How are they to hear? From other hackers. But there has to
352
+ be some initial group of hackers using the language for others even
353
+ to hear about it. I wonder how large this group has to be; how many
354
+ users make a critical mass? Off the top of my head, I'd say twenty.
355
+ If a language had twenty separate users, meaning twenty users who
356
+ decided on their own to use it, I'd consider it to be real.Getting there can't be easy. I would not be surprised if it is
357
+ harder to get from zero to twenty than from twenty to a thousand.
358
+ The best way to get those initial twenty users is probably to use
359
+ a trojan horse: to give people an application they want, which
360
+ happens to be written in the new language. 2. External Factors. Let's start by acknowledging one external factor that does affect
361
+ the popularity of a programming language. To become popular, a
362
+ programming language has to be the scripting language of a popular
363
+ system. Fortran and Cobol were the scripting languages of early
364
+ IBM mainframes. C was the scripting language of Unix, and so, later,
365
+ was Perl. Tcl is the scripting language of Tk. Java and Javascript
366
+ are intended to be the scripting languages of web browsers.Lisp is not a massively popular language because it is not the
367
+ scripting language of a massively popular system. What popularity
368
+ it retains dates back to the 1960s and 1970s, when it was the
369
+ scripting language of MIT. A lot of the great programmers of the
370
+ day were associated with MIT at some point. And in the early 1970s,
371
+ before C, MIT's dialect of Lisp, called MacLisp, was one of the
372
+ only programming languages a serious hacker would want to use.Today Lisp is the scripting language of two moderately popular
373
+ systems, Emacs and Autocad, and for that reason I suspect that most
374
+ of the Lisp programming done today is done in Emacs Lisp or AutoLisp.Programming languages don't exist in isolation. To hack is a
375
+ transitive verb — hackers are usually hacking something — and in
376
+ practice languages are judged relative to whatever they're used to
377
+ hack. So if you want to design a popular language, you either have
378
+ to supply more than a language, or you have to design your language
379
+ to replace the scripting language of some existing system.Common Lisp is unpopular partly because it's an orphan. It did
380
+ originally come with a system to hack: the Lisp Machine. But Lisp
381
+ Machines (along with parallel computers) were steamrollered by the
382
+ increasing power of general purpose processors in the 1980s. Common
383
+ Lisp might have remained popular if it had been a good scripting
384
+ language for Unix. It is, alas, an atrociously bad one.One way to describe this situation is to say that a language isn't
385
+ judged on its own merits. Another view is that a programming language
386
+ really isn't a programming language unless it's also the scripting
387
+ language of something. This only seems unfair if it comes as a
388
+ surprise. I think it's no more unfair than expecting a programming
389
+ language to have, say, an implementation. It's just part of what
390
+ a programming language is.A programming language does need a good implementation, of course,
391
+ and this must be free. Companies will pay for software, but individual
392
+ hackers won't, and it's the hackers you need to attract.A language also needs to have a book about it. The book should be
393
+ thin, well-written, and full of good examples. K&R is the ideal
394
+ here. At the moment I'd almost say that a language has to have a
395
+ book published by O'Reilly. That's becoming the test of mattering
396
+ to hackers.There should be online documentation as well. In fact, the book
397
+ can start as online documentation. But I don't think that physical
398
+ books are outmoded yet. Their format is convenient, and the de
399
+ facto censorship imposed by publishers is a useful if imperfect
400
+ filter. Bookstores are one of the most important places for learning
401
+ about new languages. 3. Brevity. Given that you can supply the three things any language needs — a
402
+ free implementation, a book, and something to hack — how do you
403
+ make a language that hackers will like?One thing hackers like is brevity. Hackers are lazy, in the same
404
+ way that mathematicians and modernist architects are lazy: they
405
+ hate anything extraneous. It would not be far from the truth to
406
+ say that a hacker about to write a program decides what language
407
+ to use, at least subconsciously, based on the total number of
408
+ characters he'll have to type. If this isn't precisely how hackers
409
+ think, a language designer would do well to act as if it were.It is a mistake to try to baby the user with long-winded expressions
410
+ that are meant to resemble English. Cobol is notorious for this
411
+ flaw. A hacker would consider being asked to write "add x to y giving z" instead of "z = x+y" as something between an insult to his intelligence and a sin against
412
+ God.It has sometimes been said that Lisp should use first and rest
413
+ instead of car and cdr, because it would make programs easier to
414
+ read. Maybe for the first couple hours. But a hacker can learn
415
+ quickly enough that car means the first element of a list and cdr
416
+ means the rest. Using first and rest means 50% more typing. And
417
+ they are also different lengths, meaning that the arguments won't
418
+ line up when they're called, as car and cdr often are, in successive
419
+ lines. I've found that it matters a lot how code lines up on the
420
+ page. I can barely read Lisp code when it is set in a variable-width
421
+ font, and friends say this is true for other languages too.Brevity is one place where strongly typed languages lose. All other
422
+ things being equal, no one wants to begin a program with a bunch
423
+ of declarations. Anything that can be implicit, should be.The individual tokens should be short as well. Perl and Common Lisp
424
+ occupy opposite poles on this question. Perl programs can be almost
425
+ cryptically dense, while the names of built-in Common Lisp operators
426
+ are comically long. The designers of Common Lisp probably expected
427
+ users to have text editors that would type these long names for
428
+ them. But the cost of a long name is not just the cost of typing
429
+ it. There is also the cost of reading it, and the cost of the space
430
+ it takes up on your screen. 4. Hackability. There is one thing more important than brevity to a hacker: being
431
+ able to do what you want. In the history of programming languages
432
+ a surprising amount of effort has gone into preventing programmers
433
+ from doing things considered to be improper. This is a dangerously
434
+ presumptuous plan. How can the language designer know what the
435
+ programmer is going to need to do? I think language designers would
436
+ do better to consider their target user to be a genius who will
437
+ need to do things they never anticipated, rather than a bumbler
438
+ who needs to be protected from himself. The bumbler will shoot
439
+ himself in the foot anyway. You may save him from referring to
440
+ variables in another package, but you can't save him from writing
441
+ a badly designed program to solve the wrong problem, and taking
442
+ forever to do it.Good programmers often want to do dangerous and unsavory things.
443
+ By unsavory I mean things that go behind whatever semantic facade
444
+ the language is trying to present: getting hold of the internal
445
+ representation of some high-level abstraction, for example. Hackers
446
+ like to hack, and hacking means getting inside things and second
447
+ guessing the original designer.Let yourself be second guessed. When you make any tool, people use
448
+ it in ways you didn't intend, and this is especially true of a
449
+ highly articulated tool like a programming language. Many a hacker
450
+ will want to tweak your semantic model in a way that you never
451
+ imagined. I say, let them; give the programmer access to as much
452
+ internal stuff as you can without endangering runtime systems like
453
+ the garbage collector.In Common Lisp I have often wanted to iterate through the fields
454
+ of a struct — to comb out references to a deleted object, for example,
455
+ or find fields that are uninitialized. I know the structs are just
456
+ vectors underneath. And yet I can't write a general purpose function
457
+ that I can call on any struct. I can only access the fields by
458
+ name, because that's what a struct is supposed to mean.A hacker may only want to subvert the intended model of things once
459
+ or twice in a big program. But what a difference it makes to be
460
+ able to. And it may be more than a question of just solving a
461
+ problem. There is a kind of pleasure here too. Hackers share the
462
+ surgeon's secret pleasure in poking about in gross innards, the
463
+ teenager's secret pleasure in popping zits. [2] For boys, at least,
464
+ certain kinds of horrors are fascinating. Maxim magazine publishes
465
+ an annual volume of photographs, containing a mix of pin-ups and
466
+ grisly accidents. They know their audience.Historically, Lisp has been good at letting hackers have their way.
467
+ The political correctness of Common Lisp is an aberration. Early
468
+ Lisps let you get your hands on everything. A good deal of that
469
+ spirit is, fortunately, preserved in macros. What a wonderful thing,
470
+ to be able to make arbitrary transformations on the source code.Classic macros are a real hacker's tool — simple, powerful, and
471
+ dangerous. It's so easy to understand what they do: you call a
472
+ function on the macro's arguments, and whatever it returns gets
473
+ inserted in place of the macro call. Hygienic macros embody the
474
+ opposite principle. They try to protect you from understanding what
475
+ they're doing. I have never heard hygienic macros explained in one
476
+ sentence. And they are a classic example of the dangers of deciding
477
+ what programmers are allowed to want. Hygienic macros are intended
478
+ to protect me from variable capture, among other things, but variable
479
+ capture is exactly what I want in some macros.A really good language should be both clean and dirty: cleanly
480
+ designed, with a small core of well understood and highly orthogonal
481
+ operators, but dirty in the sense that it lets hackers have their
482
+ way with it. C is like this. So were the early Lisps. A real hacker's
483
+ language will always have a slightly raffish character.A good programming language should have features that make the kind
484
+ of people who use the phrase "software engineering" shake their
485
+ heads disapprovingly. At the other end of the continuum are languages
486
+ like Ada and Pascal, models of propriety that are good for teaching
487
+ and not much else. 5. Throwaway Programs. To be attractive to hackers, a language must be good for writing
488
+ the kinds of programs they want to write. And that means, perhaps
489
+ surprisingly, that it has to be good for writing throwaway programs.A throwaway program is a program you write quickly for some limited
490
+ task: a program to automate some system administration task, or
491
+ generate test data for a simulation, or convert data from one format
492
+ to another. The surprising thing about throwaway programs is that,
493
+ like the "temporary" buildings built at so many American universities
494
+ during World War II, they often don't get thrown away. Many evolve
495
+ into real programs, with real features and real users.I have a hunch that the best big programs begin life this way,
496
+ rather than being designed big from the start, like the Hoover Dam.
497
+ It's terrifying to build something big from scratch. When people
498
+ take on a project that's too big, they become overwhelmed. The
499
+ project either gets bogged down, or the result is sterile and
500
+ wooden: a shopping mall rather than a real downtown, Brasilia rather
501
+ than Rome, Ada rather than C.Another way to get a big program is to start with a throwaway
502
+ program and keep improving it. This approach is less daunting, and
503
+ the design of the program benefits from evolution. I think, if one
504
+ looked, that this would turn out to be the way most big programs
505
+ were developed. And those that did evolve this way are probably
506
+ still written in whatever language they were first written in,
507
+ because it's rare for a program to be ported, except for political
508
+ reasons. And so, paradoxically, if you want to make a language that
509
+ is used for big systems, you have to make it good for writing
510
+ throwaway programs, because that's where big systems come from.Perl is a striking example of this idea. It was not only designed
511
+ for writing throwaway programs, but was pretty much a throwaway
512
+ program itself. Perl began life as a collection of utilities for
513
+ generating reports, and only evolved into a programming language
514
+ as the throwaway programs people wrote in it grew larger. It was
515
+ not until Perl 5 (if then) that the language was suitable for
516
+ writing serious programs, and yet it was already massively popular.What makes a language good for throwaway programs? To start with,
517
+ it must be readily available. A throwaway program is something that
518
+ you expect to write in an hour. So the language probably must
519
+ already be installed on the computer you're using. It can't be
520
+ something you have to install before you use it. It has to be there.
521
+ C was there because it came with the operating system. Perl was
522
+ there because it was originally a tool for system administrators,
523
+ and yours had already installed it.Being available means more than being installed, though. An
524
+ interactive language, with a command-line interface, is more
525
+ available than one that you have to compile and run separately. A
526
+ popular programming language should be interactive, and start up
527
+ fast.Another thing you want in a throwaway program is brevity. Brevity
528
+ is always attractive to hackers, and never more so than in a program
529
+ they expect to turn out in an hour. 6. Libraries. Of course the ultimate in brevity is to have the program already
530
+ written for you, and merely to call it. And this brings us to what
531
+ I think will be an increasingly important feature of programming
532
+ languages: library functions. Perl wins because it has large
533
+ libraries for manipulating strings. This class of library functions
534
+ are especially important for throwaway programs, which are often
535
+ originally written for converting or extracting data. Many Perl
536
+ programs probably begin as just a couple library calls stuck
537
+ together.I think a lot of the advances that happen in programming languages
538
+ in the next fifty years will have to do with library functions. I
539
+ think future programming languages will have libraries that are as
540
+ carefully designed as the core language. Programming language design
541
+ will not be about whether to make your language strongly or weakly
542
+ typed, or object oriented, or functional, or whatever, but about
543
+ how to design great libraries. The kind of language designers who
544
+ like to think about how to design type systems may shudder at this.
545
+ It's almost like writing applications! Too bad. Languages are for
546
+ programmers, and libraries are what programmers need.It's hard to design good libraries. It's not simply a matter of
547
+ writing a lot of code. Once the libraries get too big, it can
548
+ sometimes take longer to find the function you need than to write
549
+ the code yourself. Libraries need to be designed using a small set
550
+ of orthogonal operators, just like the core language. It ought to
551
+ be possible for the programmer to guess what library call will do
552
+ what he needs.Libraries are one place Common Lisp falls short. There are only
553
+ rudimentary libraries for manipulating strings, and almost none
554
+ for talking to the operating system. For historical reasons, Common
555
+ Lisp tries to pretend that the OS doesn't exist. And because you
556
+ can't talk to the OS, you're unlikely to be able to write a serious
557
+ program using only the built-in operators in Common Lisp. You have
558
+ to use some implementation-specific hacks as well, and in practice
559
+ these tend not to give you everything you want. Hackers would think
560
+ a lot more highly of Lisp if Common Lisp had powerful string
561
+ libraries and good OS support. 7. Syntax. Could a language with Lisp's syntax, or more precisely, lack of
562
+ syntax, ever become popular? I don't know the answer to this
563
+ question. I do think that syntax is not the main reason Lisp isn't
564
+ currently popular. Common Lisp has worse problems than unfamiliar
565
+ syntax. I know several programmers who are comfortable with prefix
566
+ syntax and yet use Perl by default, because it has powerful string
567
+ libraries and can talk to the OS. There are two possible problems with prefix notation: that it is
568
+ unfamiliar to programmers, and that it is not dense enough. The
569
+ conventional wisdom in the Lisp world is that the first problem is
570
+ the real one. I'm not so sure. Yes, prefix notation makes ordinary
571
+ programmers panic. But I don't think ordinary programmers' opinions
572
+ matter. Languages become popular or unpopular based on what expert
573
+ hackers think of them, and I think expert hackers might be able to
574
+ deal with prefix notation. Perl syntax can be pretty incomprehensible,
575
+ but that has not stood in the way of Perl's popularity. If anything
576
+ it may have helped foster a Perl cult.A more serious problem is the diffuseness of prefix notation. For
577
+ expert hackers, that really is a problem. No one wants to write
578
+ (aref a x y) when they could write a[x,y]. In this particular case there is a way to finesse our way out of
579
+ the problem. If we treat data structures as if they were functions
580
+ on indexes, we could write (a x y) instead, which is even shorter
581
+ than the Perl form. Similar tricks may shorten other types of
582
+ expressions.We can get rid of (or make optional) a lot of parentheses by making
583
+ indentation significant. That's how programmers read code anyway:
584
+ when indentation says one thing and delimiters say another, we go
585
+ by the indentation. Treating indentation as significant would
586
+ eliminate this common source of bugs as well as making programs
587
+ shorter.Sometimes infix syntax is easier to read. This is especially true
588
+ for math expressions. I've used Lisp my whole programming life and
589
+ I still don't find prefix math expressions natural. And yet it is
590
+ convenient, especially when you're generating code, to have operators
591
+ that take any number of arguments. So if we do have infix syntax,
592
+ it should probably be implemented as some kind of read-macro.I don't think we should be religiously opposed to introducing syntax
593
+ into Lisp, as long as it translates in a well-understood way into
594
+ underlying s-expressions. There is already a good deal of syntax
595
+ in Lisp. It's not necessarily bad to introduce more, as long as no
596
+ one is forced to use it. In Common Lisp, some delimiters are reserved
597
+ for the language, suggesting that at least some of the designers
598
+ intended to have more syntax in the future.One of the most egregiously unlispy pieces of syntax in Common Lisp
599
+ occurs in format strings; format is a language in its own right,
600
+ and that language is not Lisp. If there were a plan for introducing
601
+ more syntax into Lisp, format specifiers might be able to be included
602
+ in it. It would be a good thing if macros could generate format
603
+ specifiers the way they generate any other kind of code.An eminent Lisp hacker told me that his copy of CLTL falls open to
604
+ the section format. Mine too. This probably indicates room for
605
+ improvement. It may also mean that programs do a lot of I/O. 8. Efficiency. A good language, as everyone knows, should generate fast code. But
606
+ in practice I don't think fast code comes primarily from things
607
+ you do in the design of the language. As Knuth pointed out long
608
+ ago, speed only matters in certain critical bottlenecks. And as
609
+ many programmers have observed since, one is very often mistaken
610
+ about where these bottlenecks are.So, in practice, the way to get fast code is to have a very good
611
+ profiler, rather than by, say, making the language strongly typed.
612
+ You don't need to know the type of every argument in every call in
613
+ the program. You do need to be able to declare the types of arguments
614
+ in the bottlenecks. And even more, you need to be able to find out
615
+ where the bottlenecks are.One complaint people have had with Lisp is that it's hard to tell
616
+ what's expensive. This might be true. It might also be inevitable,
617
+ if you want to have a very abstract language. And in any case I
618
+ think good profiling would go a long way toward fixing the problem:
619
+ you'd soon learn what was expensive.Part of the problem here is social. Language designers like to
620
+ write fast compilers. That's how they measure their skill. They
621
+ think of the profiler as an add-on, at best. But in practice a good
622
+ profiler may do more to improve the speed of actual programs written
623
+ in the language than a compiler that generates fast code. Here,
624
+ again, language designers are somewhat out of touch with their
625
+ users. They do a really good job of solving slightly the wrong
626
+ problem.It might be a good idea to have an active profiler — to push
627
+ performance data to the programmer instead of waiting for him to
628
+ come asking for it. For example, the editor could display bottlenecks
629
+ in red when the programmer edits the source code. Another approach
630
+ would be to somehow represent what's happening in running programs.
631
+ This would be an especially big win in server-based applications,
632
+ where you have lots of running programs to look at. An active
633
+ profiler could show graphically what's happening in memory as a
634
+ program's running, or even make sounds that tell what's happening.Sound is a good cue to problems. In one place I worked, we had a
635
+ big board of dials showing what was happening to our web servers.
636
+ The hands were moved by little servomotors that made a slight noise
637
+ when they turned. I couldn't see the board from my desk, but I
638
+ found that I could tell immediately, by the sound, when there was
639
+ a problem with a server.It might even be possible to write a profiler that would automatically
640
+ detect inefficient algorithms. I would not be surprised if certain
641
+ patterns of memory access turned out to be sure signs of bad
642
+ algorithms. If there were a little guy running around inside the
643
+ computer executing our programs, he would probably have as long
644
+ and plaintive a tale to tell about his job as a federal government
645
+ employee. I often have a feeling that I'm sending the processor on
646
+ a lot of wild goose chases, but I've never had a good way to look
647
+ at what it's doing.A number of Lisps now compile into byte code, which is then executed
648
+ by an interpreter. This is usually done to make the implementation
649
+ easier to port, but it could be a useful language feature. It might
650
+ be a good idea to make the byte code an official part of the
651
+ language, and to allow programmers to use inline byte code in
652
+ bottlenecks. Then such optimizations would be portable too.The nature of speed, as perceived by the end-user, may be changing.
653
+ With the rise of server-based applications, more and more programs
654
+ may turn out to be i/o-bound. It will be worth making i/o fast.
655
+ The language can help with straightforward measures like simple,
656
+ fast, formatted output functions, and also with deep structural
657
+ changes like caching and persistent objects.Users are interested in response time. But another kind of efficiency
658
+ will be increasingly important: the number of simultaneous users
659
+ you can support per processor. Many of the interesting applications
660
+ written in the near future will be server-based, and the number of
661
+ users per server is the critical question for anyone hosting such
662
+ applications. In the capital cost of a business offering a server-based
663
+ application, this is the divisor.For years, efficiency hasn't mattered much in most end-user
664
+ applications. Developers have been able to assume that each user
665
+ would have an increasingly powerful processor sitting on their
666
+ desk. And by Parkinson's Law, software has expanded to use the
667
+ resources available. That will change with server-based applications.
668
+ In that world, the hardware and software will be supplied together.
669
+ For companies that offer server-based applications, it will make
670
+ a very big difference to the bottom line how many users they can
671
+ support per server. In some applications, the processor will be the limiting factor,
672
+ and execution speed will be the most important thing to optimize.
673
+ But often memory will be the limit; the number of simultaneous
674
+ users will be determined by the amount of memory you need for each
675
+ user's data. The language can help here too. Good support for
676
+ threads will enable all the users to share a single heap. It may
677
+ also help to have persistent objects and/or language level support
678
+ for lazy loading. 9 Time: The last ingredient a popular language needs is time. No one wants
679
+ to write programs in a language that might go away, as so many
680
+ programming languages do. So most hackers will tend to wait until
681
+ a language has been around for a couple years before even considering
682
+ using it. Inventors of wonderful new things are often surprised to discover
683
+ this, but you need time to get any message through to people. A
684
+ friend of mine rarely does anything the first time someone asks
685
+ him. He knows that people sometimes ask for things that they turn
686
+ out not to want. To avoid wasting his time, he waits till the third
687
+ or fourth time he's asked to do something; by then, whoever's asking
688
+ him may be fairly annoyed, but at least they probably really do
689
+ want whatever they're asking for. Most people have learned to do a similar sort of filtering on new
690
+ things they hear about. They don't even start paying attention
691
+ until they've heard about something ten times. They're perfectly
692
+ justified: the majority of hot new whatevers do turn out to be a
693
+ waste of time, and eventually go away. By delaying learning VRML,
694
+ I avoided having to learn it at all. So anyone who invents something new has to expect to keep repeating
695
+ their message for years before people will start to get it. We
696
+ wrote what was, as far as I know, the first web-server based
697
+ application, and it took us years to get it through to people that
698
+ it didn't have to be downloaded. It wasn't that they were stupid.
699
+ They just had us tuned out. The good news is, simple repetition solves the problem. All you
700
+ have to do is keep telling your story, and eventually people will
701
+ start to hear. It's not when people notice you're there that they
702
+ pay attention; it's when they notice you're still there. It's just as well that it usually takes a while to gain momentum.
703
+ Most technologies evolve a good deal even after they're first
704
+ launched — programming languages especially. Nothing could be better,
705
+ for a new technology, than a few years of being used only by a small
706
+ number of early adopters. Early adopters are sophisticated and
707
+ demanding, and quickly flush out whatever flaws remain in your
708
+ technology. When you only have a few users you can be in close
709
+ contact with all of them. And early adopters are forgiving when
710
+ you improve your system, even if this causes some breakage. There are two ways new technology gets introduced: the organic
711
+ growth method, and the big bang method. The organic growth method
712
+ is exemplified by the classic seat-of-the-pants underfunded garage
713
+ startup. A couple guys, working in obscurity, develop some new
714
+ technology. They launch it with no marketing and initially have
715
+ only a few (fanatically devoted) users. They continue to improve
716
+ the technology, and meanwhile their user base grows by word of
717
+ mouth. Before they know it, they're big. The other approach, the big bang method, is exemplified by the
718
+ VC-backed, heavily marketed startup. They rush to develop a product,
719
+ launch it with great publicity, and immediately (they hope) have
720
+ a large user base. Generally, the garage guys envy the big bang guys. The big bang
721
+ guys are smooth and confident and respected by the VCs. They can
722
+ afford the best of everything, and the PR campaign surrounding the
723
+ launch has the side effect of making them celebrities. The organic
724
+ growth guys, sitting in their garage, feel poor and unloved. And
725
+ yet I think they are often mistaken to feel sorry for themselves.
726
+ Organic growth seems to yield better technology and richer founders
727
+ than the big bang method. If you look at the dominant technologies
728
+ today, you'll find that most of them grew organically. This pattern doesn't only apply to companies. You see it in sponsored
729
+ research too. Multics and Common Lisp were big-bang projects, and
730
+ Unix and MacLisp were organic growth projects. 10 Redesign: "The best writing is rewriting," wrote E. B. White. Every good
731
+ writer knows this, and it's true for software too. The most important
732
+ part of design is redesign. Programming languages, especially,
733
+ don't get redesigned enough. To write good software you must simultaneously keep two opposing
734
+ ideas in your head. You need the young hacker's naive faith in
735
+ his abilities, and at the same time the veteran's skepticism. You
736
+ have to be able to think
737
+ how hard can it be? with one half of
738
+ your brain while thinking
739
+ it will never work with the other. The trick is to realize that there's no real contradiction here.
740
+ You want to be optimistic and skeptical about two different things.
741
+ You have to be optimistic about the possibility of solving the
742
+ problem, but skeptical about the value of whatever solution you've
743
+ got so far. People who do good work often think that whatever they're working
744
+ on is no good. Others see what they've done and are full of wonder,
745
+ but the creator is full of worry. This pattern is no coincidence:
746
+ it is the worry that made the work good. If you can keep hope and worry balanced, they will drive a project
747
+ forward the same way your two legs drive a bicycle forward. In the
748
+ first phase of the two-cycle innovation engine, you work furiously
749
+ on some problem, inspired by your confidence that you'll be able
750
+ to solve it. In the second phase, you look at what you've done in
751
+ the cold light of morning, and see all its flaws very clearly. But
752
+ as long as your critical spirit doesn't outweigh your hope, you'll
753
+ be able to look at your admittedly incomplete system, and think,
754
+ how hard can it be to get the rest of the way?, thereby continuing
755
+ the cycle. It's tricky to keep the two forces balanced. In young hackers,
756
+ optimism predominates. They produce something, are convinced it's
757
+ great, and never improve it. In old hackers, skepticism predominates,
758
+ and they won't even dare to take on ambitious projects. Anything you can do to keep the redesign cycle going is good. Prose
759
+ can be rewritten over and over until you're happy with it. But
760
+ software, as a rule, doesn't get redesigned enough. Prose has
761
+ readers, but software has users. If a writer rewrites an essay,
762
+ people who read the old version are unlikely to complain that their
763
+ thoughts have been broken by some newly introduced incompatibility. Users are a double-edged sword. They can help you improve your
764
+ language, but they can also deter you from improving it. So choose
765
+ your users carefully, and be slow to grow their number. Having
766
+ users is like optimization: the wise course is to delay it. Also,
767
+ as a general rule, you can at any given time get away with changing
768
+ more than you think. Introducing change is like pulling off a
769
+ bandage: the pain is a memory almost as soon as you feel it. Everyone knows that it's not a good idea to have a language designed
770
+ by a committee. Committees yield bad design. But I think the worst
771
+ danger of committees is that they interfere with redesign. It is
772
+ so much work to introduce changes that no one wants to bother.
773
+ Whatever a committee decides tends to stay that way, even if most
774
+ of the members don't like it. Even a committee of two gets in the way of redesign. This happens
775
+ particularly in the interfaces between pieces of software written
776
+ by two different people. To change the interface both have to agree
777
+ to change it at once. And so interfaces tend not to change at all,
778
+ which is a problem because they tend to be one of the most ad hoc
779
+ parts of any system. One solution here might be to design systems so that interfaces
780
+ are horizontal instead of vertical — so that modules are always
781
+ vertically stacked strata of abstraction. Then the interface will
782
+ tend to be owned by one of them. The lower of two levels will either
783
+ be a language in which the upper is written, in which case the
784
+ lower level will own the interface, or it will be a slave, in which
785
+ case the interface can be dictated by the upper level. 11 Lisp: What all this implies is that there is hope for a new Lisp. There
786
+ is hope for any language that gives hackers what they want, including
787
+ Lisp. I think we may have made a mistake in thinking that hackers
788
+ are turned off by Lisp's strangeness. This comforting illusion may
789
+ have prevented us from seeing the real problem with Lisp, or at
790
+ least Common Lisp, which is that it sucks for doing what hackers
791
+ want to do. A hacker's language needs powerful libraries and
792
+ something to hack. Common Lisp has neither. A hacker's language is
793
+ terse and hackable. Common Lisp is not. The good news is, it's not Lisp that sucks, but Common Lisp. If we
794
+ can develop a new Lisp that is a real hacker's language, I think
795
+ hackers will use it. They will use whatever language does the job.
796
+ All we have to do is make sure this new Lisp does some important
797
+ job better than other languages. History offers some encouragement. Over time, successive new
798
+ programming languages have taken more and more features from Lisp.
799
+ There is no longer much left to copy before the language you've
800
+ made is Lisp. The latest hot language, Python, is a watered-down
801
+ Lisp with infix syntax and no macros. A new Lisp would be a natural
802
+ step in this progression. I sometimes think that it would be a good marketing trick to call
803
+ it an improved version of Python. That sounds hipper than Lisp. To
804
+ many people, Lisp is a slow AI language with a lot of parentheses.
805
+ Fritz Kunze's official biography carefully avoids mentioning the
806
+ L-word. But my guess is that we shouldn't be afraid to call the
807
+ new Lisp Lisp. Lisp still has a lot of latent respect among the
808
+ very best hackers — the ones who took 6.001 and understood it, for
809
+ example. And those are the users you need to win. In "How to Become a Hacker," Eric Raymond describes Lisp as something
810
+ like Latin or Greek — a language you should learn as an intellectual
811
+ exercise, even though you won't actually use it:
812
+
813
+ Lisp is worth learning for the profound enlightenment experience
814
+ you will have when you finally get it; that experience will make
815
+ you a better programmer for the rest of your days, even if you
816
+ never actually use Lisp itself a lot.
817
+
818
+ If I didn't know Lisp, reading this would set me asking questions.
819
+ A language that would make me a better programmer, if it means
820
+ anything at all, means a language that would be better for programming.
821
+ And that is in fact the implication of what Eric is saying. As long as that idea is still floating around, I think hackers will
822
+ be receptive enough to a new Lisp, even if it is called Lisp. But
823
+ this Lisp must be a hacker's language, like the classic Lisps of
824
+ the 1970s. It must be terse, simple, and hackable. And it must have
825
+ powerful libraries for doing what hackers want to do now. In the matter of libraries I think there is room to beat languages
826
+ like Perl and Python at their own game. A lot of the new applications
827
+ that will need to be written in the coming years will be
828
+ server-based
829
+ applications. There's no reason a new Lisp shouldn't have string
830
+ libraries as good as Perl, and if this new Lisp also had powerful
831
+ libraries for server-based applications, it could be very popular.
832
+ Real hackers won't turn up their noses at a new tool that will let
833
+ them solve hard problems with a few library calls. Remember, hackers
834
+ are lazy. It could be an even bigger win to have core language support for
835
+ server-based applications. For example, explicit support for programs
836
+ with multiple users, or data ownership at the level of type tags. Server-based applications also give us the answer to the question
837
+ of what this new Lisp will be used to hack. It would not hurt to
838
+ make Lisp better as a scripting language for Unix. (It would be
839
+ hard to make it worse.) But I think there are areas where existing
840
+ languages would be easier to beat. I think it might be better to
841
+ follow the model of Tcl, and supply the Lisp together with a complete
842
+ system for supporting server-based applications. Lisp is a natural
843
+ fit for server-based applications. Lexical closures provide a way
844
+ to get the effect of subroutines when the ui is just a series of
845
+ web pages. S-expressions map nicely onto html, and macros are good
846
+ at generating it. There need to be better tools for writing
847
+ server-based applications, and there needs to be a new Lisp, and
848
+ the two would work very well together. 12 The Dream Language: By way of summary, let's try describing the hacker's dream language.
849
+ The dream language is
850
+ beautiful, clean, and terse. It has an
851
+ interactive toplevel that starts up fast. You can write programs
852
+ to solve common problems with very little code. Nearly all the
853
+ code in any program you write is code that's specific to your
854
+ application. Everything else has been done for you. The syntax of the language is brief to a fault. You never have to
855
+ type an unnecessary character, or even to use the shift key much. Using big abstractions you can write the first version of a program
856
+ very quickly. Later, when you want to optimize, there's a really
857
+ good profiler that tells you where to focus your attention. You
858
+ can make inner loops blindingly fast, even writing inline byte code
859
+ if you need to. There are lots of good examples to learn from, and the language is
860
+ intuitive enough that you can learn how to use it from examples in
861
+ a couple minutes. You don't need to look in the manual much. The
862
+ manual is thin, and has few warnings and qualifications. The language has a small core, and powerful, highly orthogonal
863
+ libraries that are as carefully designed as the core language. The
864
+ libraries all work well together; everything in the language fits
865
+ together like the parts in a fine camera. Nothing is deprecated,
866
+ or retained for compatibility. The source code of all the libraries
867
+ is readily available. It's easy to talk to the operating system
868
+ and to applications written in other languages. The language is built in layers. The higher-level abstractions are
869
+ built in a very transparent way out of lower-level abstractions,
870
+ which you can get hold of if you want. Nothing is hidden from you that doesn't absolutely have to be. The
871
+ language offers abstractions only as a way of saving you work,
872
+ rather than as a way of telling you what to do. In fact, the language
873
+ encourages you to be an equal participant in its design. You can
874
+ change everything about it, including even its syntax, and anything
875
+ you write has, as much as possible, the same status as what comes
876
+ predefined. Notes: [1] Macros very close to the modern idea were proposed by Timothy
877
+ Hart in 1964, two years after Lisp 1.5 was released. What was
878
+ missing, initially, were ways to avoid variable capture and multiple
879
+ evaluation; Hart's examples are subject to both. [2] In When the Air Hits Your Brain, neurosurgeon Frank Vertosick
880
+ recounts a conversation in which his chief resident, Gary, talks
881
+ about the difference between surgeons and internists ("fleas"):
882
+
883
+ Gary and I ordered a large pizza and found an open booth. The
884
+ chief lit a cigarette. "Look at those goddamn fleas, jabbering
885
+ about some disease they'll see once in their lifetimes. That's
886
+ the trouble with fleas, they only like the bizarre stuff. They
887
+ hate their bread and butter cases. That's the difference between
888
+ us and the fucking fleas. See, we love big juicy lumbar disc
889
+ herniations, but they hate hypertension...."
890
+
891
+ It's hard to think of a lumbar disc herniation as juicy (except
892
+ literally). And yet I think I know what they mean. I've often had
893
+ a juicy bug to track down. Someone who's not a programmer would
894
+ find it hard to imagine that there could be pleasure in a bug.
895
+ Surely it's better if everything just works. In one way, it is.
896
+ And yet there is undeniably a grim satisfaction in hunting down
897
+ certain sorts of bugs.
898
+ ```
out_tensor/lm_head.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e83ee9ad17b69f0006c79a0db328747dd20fc7a99f2527e44e47b7961132b69e
3
+ size 103953008
out_tensor/model.layers.0.mlp.down_proj.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58b753a71e9cbb8edeee01f0f74a633fc14fe4247f8d9470d2f431e715dabe86
3
+ size 59008184
out_tensor/model.layers.0.mlp.gate_proj.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4bbb48147fe352e5d4839aa006d60153bcb0324af3c5bb99d7f4189804f7a55f
3
+ size 58966744
out_tensor/model.layers.0.mlp.up_proj.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c4dd82135f6e65d0fb8efff27d45d256518b12969a5545eeb3ea8f5fdeb6e92
3
+ size 58966728
out_tensor/model.layers.0.self_attn.k_proj.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b648ecc13d09154b96fdd9d90051c9a04ca824d20971dcfa0c2dbace8fa659d
3
+ size 4227800
out_tensor/model.layers.0.self_attn.o_proj.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c596236068604d0c4bb1ae70a9ea53028c2d6a0ece26a7346bfd1fe29d6fb9ce
3
+ size 16859872
out_tensor/model.layers.0.self_attn.q_proj.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:937ac10f7c027aa74a5a993ede3bc8528178a8ba925c73793c48fca572427e0f
3
+ size 16859872
out_tensor/model.layers.0.self_attn.v_proj.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dadd9c3ee54460202627ca5139ecbe9e885eebbc12e5eaf9b5988e0a1df80ca4
3
+ size 4227800
out_tensor/model.layers.1.mlp.down_proj.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e4c9a98433ac336c7da610adf2b9f23b4fe9f63536506a03ce8f4173d72770b
3
+ size 59008184
out_tensor/model.layers.1.mlp.gate_proj.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84a7e2ac594e4c85c42bde532feb4059a98b4e1f81f5d311e18468d4deede457
3
+ size 58966744
out_tensor/model.layers.1.mlp.up_proj.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:509af50ac331eb7d333249885a66436202fef7b7f50e0c9817c8ce37d51056cf
3
+ size 58966728
out_tensor/model.layers.1.self_attn.k_proj.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f102631c5753effe17e396aa1bb8f09b5f7727af8d078b8f45028ed696419d00
3
+ size 4227800
out_tensor/model.layers.1.self_attn.o_proj.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52bd7ccbd8b4cb551ed0e2fdb93ddaa626dd67c72f5625897abeac865596be8d
3
+ size 16859872
out_tensor/model.layers.1.self_attn.q_proj.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aacb3dc932fc4c0e18503f37e7ff81b2aebd1d4e4db1dfd078b7c5949c416cf5
3
+ size 16859872
out_tensor/model.layers.1.self_attn.v_proj.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e47471ee6cf191c4ed397002ad071209df09f15f8b87a3cd73a812f5d4b381de
3
+ size 4227800
out_tensor/model.layers.10.mlp.down_proj.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f863b6d9f46e5b6c60f2f538b89847a115d92e1e06b05ccc5c0cbd556ed3ac70
3
+ size 59008192
out_tensor/model.layers.10.mlp.gate_proj.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82eabda5dfaa8dc4c30e8f653408e7e3d01d2fcbf1bcd2cf09a591708eef9916
3
+ size 58966744
out_tensor/model.layers.10.mlp.up_proj.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71966f0ef3456624c615909999369a7caf3b5c9de2a6f5aebdbec25d52ee2221
3
+ size 58966736
out_tensor/model.layers.10.self_attn.k_proj.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fad96aa9e520072456007d81da58816242764edd90eacfbaff657966a6b4e79
3
+ size 3228960
out_tensor/model.layers.10.self_attn.o_proj.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29184abc9a15291f76e4907ff653a1550da4bbe273bc28e0ae9fc9a440fd16ec
3
+ size 12862760
out_tensor/model.layers.10.self_attn.q_proj.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b89fd52d1bc8de6dfb73642230d7dd152a5bbb6cf54ae1bfc3903cb7a0c99584
3
+ size 12862760
out_tensor/model.layers.10.self_attn.v_proj.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a48605e596dabe902ff98039a73bfa1fe13f2fcd88fbeb13547e4821376595e
3
+ size 4277536
out_tensor/model.layers.11.mlp.down_proj.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17a61ddc6f0678e896cf52ccbc8a432d74803c3d01aa06379e991b11c91d65b2
3
+ size 59008192
out_tensor/model.layers.11.mlp.gate_proj.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff54e50b631a4086f367d8218c6656c41a0fb69cfd7c00099c5cc8a89f57dcdd
3
+ size 58966744
out_tensor/model.layers.11.mlp.up_proj.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ae0ebb419ff3efca4e0810fbc77f9ede58eb58ef9535746ada2c7abe15f21b5
3
+ size 58966736
out_tensor/model.layers.11.self_attn.k_proj.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8aa55accca8bc2457a8eeb5748c28fe04b706298438192341d327f94f70dcf5f
3
+ size 3228960
out_tensor/model.layers.11.self_attn.o_proj.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4ee4581744296fe89ce897037879eba1611845889966c7c9a25352225d0d344
3
+ size 12862760
out_tensor/model.layers.11.self_attn.q_proj.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d51541b3a96aa41df3375312ebf56a5cd583ed39bcd439311b29bb22a292b4d
3
+ size 12862760
out_tensor/model.layers.11.self_attn.v_proj.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c1683279c8a302ef40142fce030a358a855ac025485a30612f0835a79f35c1a
3
+ size 4277536
out_tensor/model.layers.12.mlp.down_proj.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96a40603ee96fe2f7c03b25fd31d52026c8bfe453bdfaeb12a216699fb244620
3
+ size 59008192
out_tensor/model.layers.12.mlp.gate_proj.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5526c9ccea1f8253c76a8865fb720fd14260672ba743338fab2f45e6ca38304
3
+ size 58966744
out_tensor/model.layers.12.mlp.up_proj.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6097378e665a6a7084648e99386f31c05c672a21ba28a84cd79d1b5d95f5a919
3
+ size 58966736
out_tensor/model.layers.12.self_attn.k_proj.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b45f1fdb613a1afa4dfdd94c11d1f36ed870a8dd4739e9a3edfada57a6bd7d3
3
+ size 4227808
out_tensor/model.layers.12.self_attn.o_proj.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3eac9cf397880af4c6cb35a91efacfc40f0c6e7dbf49e8b18e82ff8ab391e371
3
+ size 16859880
out_tensor/model.layers.12.self_attn.q_proj.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e3e7f6d25e0e9f3acb2262fa26d50f9fd426b5fc832fa319538c48d81994854
3
+ size 16859880
out_tensor/model.layers.12.self_attn.v_proj.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:893c79c250a02a7ef526359aa8a1306f1f64a8abe64f4e3c09ad5906e8f37b83
3
+ size 4227808
out_tensor/model.layers.13.mlp.down_proj.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89ed950d55c7b603a99889aee7cd5aa35690c0f4cceccc2d98f14118f44dfa38
3
+ size 59008192