Update README.md

README.md, changed section (@@ -37,6 +37,87 @@ Merges:):
- Tokenizers 0.14.1

## Evals LM-Evaluation Harness

`mt-bench`:

```
Mode: single
Input file: data/mt_bench/model_judgment/gpt-4_single.jsonl

########## First turn ##########
                                     score
model                          turn
gpt-4                          1     8.95625
claude-v1                      1     8.15000
gpt-3.5-turbo                  1     8.07500
LUNA-SOLARkrautLM-Instruct     1     7.93750
UNA-SOLAR-10.7B-Instruct-v1.0  1     7.80625
vicuna-33b-v1.3                1     7.45625
wizardlm-30b                   1     7.13125
tulu-30b                       1     7.01875
vicuna-13b-v1.3                1     6.81250
guanaco-65b                    1     6.78125
nous-hermes-13b                1     6.43125
alpaca-13b                     1     4.97500
rwkv-4-raven-14b               1     4.74375
llama-13b                      1     3.26250

########## Second turn ##########
                                     score
model                          turn
gpt-4                          2     9.025000
gpt-3.5-turbo                  2     7.812500
claude-v1                      2     7.650000
UNA-SOLAR-10.7B-Instruct-v1.0  2     7.237500
LUNA-SOLARkrautLM-Instruct     2     6.987500
wizardlm-30b                   2     6.887500
vicuna-33b-v1.3                2     6.787500
guanaco-65b                    2     6.037500
vicuna-13b-v1.3                2     5.962500
tulu-30b                       2     5.850000
nous-hermes-13b                2     4.664557
alpaca-13b                     2     4.087500
rwkv-4-raven-14b               2     3.225000
llama-13b                      2     1.950000

########## Average ##########
                                   score
model
gpt-4                          8.990625
gpt-3.5-turbo                  7.943750
claude-instant-v1              7.905660
claude-v1                      7.900000
UNA-SOLAR-10.7B-Instruct-v1.0  7.521875
LUNA-SOLARkrautLM-Instruct     7.462500
vicuna-33b-v1.3                7.121875
wizardlm-30b                   7.009375
Llama-2-70b-chat               6.856250
Llama-2-13b-chat               6.650000
guanaco-33b                    6.528125
tulu-30b                       6.434375
guanaco-65b                    6.409375
oasst-sft-7-llama-30b          6.409375
palm-2-chat-bison-001          6.400000
mpt-30b-chat                   6.393750
vicuna-13b-v1.3                6.387500
wizardlm-13b                   6.353125
Llama-2-7b-chat                6.268750
vicuna-7b-v1.3                 5.996875
baize-v2-13b                   5.750000
nous-hermes-13b                5.553459
mpt-7b-chat                    5.459119
gpt4all-13b-snoozy             5.452830
koala-13b                      5.350000
mpt-30b-instruct               5.218750
falcon-40b-instruct            5.168750
h2ogpt-oasst-open-llama-13b    4.625000
alpaca-13b                     4.531250
chatglm-6b                     4.500000
oasst-sft-4-pythia-12b         4.318750
rwkv-4-raven-14b               3.984375
dolly-v2-12b                   3.275000
fastchat-t5-3b                 3.040625
stablelm-tuned-alpha-7b        2.753125
llama-13b                      2.606250
```
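The per-turn and average scores above can be recomputed directly from the judgment file. A minimal sketch follows, assuming the FastChat `llm_judge` single-mode judgment format, i.e. each line of `gpt-4_single.jsonl` is a JSON object with at least `model`, `turn`, and `score` fields; the field names and the `-1` marker for unparsed ratings are assumptions, not something verified in this repo.

```python
# Minimal sketch: recompute the per-turn and overall mt-bench averages shown above.
# Assumption: each line of the judgment file is a JSON object with "model",
# "turn", and "score" fields (FastChat llm_judge single-mode output).
import pandas as pd

INPUT_FILE = "data/mt_bench/model_judgment/gpt-4_single.jsonl"

df = pd.read_json(INPUT_FILE, lines=True)

# Drop judgments without a usable rating (assumed to be recorded as -1).
df = df[df["score"] != -1]

for turn in (1, 2):
    print(f"########## Turn {turn} ##########")
    print(
        df[df["turn"] == turn]
        .groupby("model")["score"]
        .mean()
        .sort_values(ascending=False)
    )

print("########## Average ##########")
print(df.groupby("model")["score"].mean().sort_values(ascending=False))
```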

`big-refactor` branch: