booydar committed
Commit 649e5b3
Parent: 170a088

add llama3.1 + average tab

This view is limited to 50 files because it contains too many changes; see the raw diff for the full changeset.
Files changed (50):
  1. app.py +1 -1
  2. data/BABILong NeurIPS24 Figs - leaderboard.csv +2 -0
  3. notebooks/process_results_csv.ipynb +291 -0
  4. notebooks/test.ipynb +0 -78
  5. results/01-ai Yi-34B-200k/avg/0.csv +2 -0
  6. results/01-ai Yi-34B-200k/avg/1000.csv +2 -0
  7. results/01-ai Yi-34B-200k/avg/16000.csv +2 -0
  8. results/01-ai Yi-34B-200k/avg/2000.csv +2 -0
  9. results/01-ai Yi-34B-200k/avg/32000.csv +2 -0
  10. results/01-ai Yi-34B-200k/avg/4000.csv +2 -0
  11. results/01-ai Yi-34B-200k/avg/64000.csv +2 -0
  12. results/01-ai Yi-34B-200k/avg/8000.csv +2 -0
  13. results/01-ai Yi-34B/avg/0.csv +2 -0
  14. results/01-ai Yi-34B/avg/1000.csv +2 -0
  15. results/01-ai Yi-34B/avg/16000.csv +2 -0
  16. results/01-ai Yi-34B/avg/2000.csv +2 -0
  17. results/01-ai Yi-34B/avg/32000.csv +2 -0
  18. results/01-ai Yi-34B/avg/4000.csv +2 -0
  19. results/01-ai Yi-34B/avg/8000.csv +2 -0
  20. results/01-ai Yi-9B-200k/avg/0.csv +2 -0
  21. results/01-ai Yi-9B-200k/avg/1000.csv +2 -0
  22. results/01-ai Yi-9B-200k/avg/128000.csv +2 -0
  23. results/01-ai Yi-9B-200k/avg/16000.csv +2 -0
  24. results/01-ai Yi-9B-200k/avg/2000.csv +2 -0
  25. results/01-ai Yi-9B-200k/avg/32000.csv +2 -0
  26. results/01-ai Yi-9B-200k/avg/4000.csv +2 -0
  27. results/01-ai Yi-9B-200k/avg/64000.csv +2 -0
  28. results/01-ai Yi-9B-200k/avg/8000.csv +2 -0
  29. results/GPT-2 (137M)/avg/0.csv +2 -0
  30. results/GPT-2 (137M)/avg/1000.csv +2 -0
  31. results/GPT-4 (gpt-4-0125-preview)/avg/0.csv +2 -0
  32. results/GPT-4 (gpt-4-0125-preview)/avg/1000.csv +2 -0
  33. results/GPT-4 (gpt-4-0125-preview)/avg/128000.csv +2 -0
  34. results/GPT-4 (gpt-4-0125-preview)/avg/16000.csv +2 -0
  35. results/GPT-4 (gpt-4-0125-preview)/avg/2000.csv +2 -0
  36. results/GPT-4 (gpt-4-0125-preview)/avg/32000.csv +2 -0
  37. results/GPT-4 (gpt-4-0125-preview)/avg/4000.csv +2 -0
  38. results/GPT-4 (gpt-4-0125-preview)/avg/64000.csv +2 -0
  39. results/GPT-4 (gpt-4-0125-preview)/avg/8000.csv +2 -0
  40. results/LLaMA-2-7B-32K/avg/0.csv +2 -0
  41. results/LLaMA-2-7B-32K/avg/1000.csv +2 -0
  42. results/LLaMA-2-7B-32K/avg/16000.csv +2 -0
  43. results/LLaMA-2-7B-32K/avg/2000.csv +2 -0
  44. results/LLaMA-2-7B-32K/avg/32000.csv +2 -0
  45. results/LLaMA-2-7B-32K/avg/4000.csv +2 -0
  46. results/LLaMA-2-7B-32K/avg/8000.csv +2 -0
  47. results/Llama-2-7B-32K-Instruct/avg/0.csv +2 -0
  48. results/Llama-2-7B-32K-Instruct/avg/1000.csv +2 -0
  49. results/Llama-2-7B-32K-Instruct/avg/16000.csv +2 -0
  50. results/Llama-2-7B-32K-Instruct/avg/2000.csv +2 -0
app.py CHANGED
@@ -86,7 +86,7 @@ def build_leaderboard_tab(folders):
     }
 
     with gr.Tabs() as tabs:
-        for tab_id, tab_name in enumerate(['qa1','qa2', 'qa3', 'qa4', 'qa5']):
+        for tab_id, tab_name in enumerate(['avg', 'qa1','qa2', 'qa3', 'qa4', 'qa5']):
            df = load_model(folders, tab_name, msg_lengths)
            cmap = LinearSegmentedColormap.from_list('ryg', ["red", "yellow", "green"], N=256)
 
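For reference, a minimal standalone sketch of what the tab loop above amounts to, assuming one Gradio Dataframe per tab; load_model_stub, the demo values, and the Styler hand-off are illustrations only (the real load_model in app.py is not shown in this diff):

import gradio as gr
import pandas as pd
from matplotlib.colors import LinearSegmentedColormap

def load_model_stub(folders, tab_name, msg_lengths):
    # Stand-in for app.py's load_model: one row per model, one numeric column per context length.
    return pd.DataFrame({str(l): [1.0 - 0.1 * i] for i, l in enumerate(msg_lengths)}, index=["demo-model"])

msg_lengths = [0, 1000, 2000, 4000, 8000]
folders = []  # in the real app: per-model directories under results/

with gr.Blocks() as demo:
    with gr.Tabs():
        # 'avg' is the tab added by this commit, shown before the five bAbI QA tasks.
        for tab_id, tab_name in enumerate(['avg', 'qa1', 'qa2', 'qa3', 'qa4', 'qa5']):
            with gr.Tab(tab_name, id=tab_id):
                df = load_model_stub(folders, tab_name, msg_lengths)
                cmap = LinearSegmentedColormap.from_list('ryg', ["red", "yellow", "green"], N=256)
                # Recent Gradio versions accept a pandas Styler; otherwise pass df directly.
                gr.Dataframe(df.style.background_gradient(cmap=cmap, axis=None))

# demo.launch()  # uncomment to serve the demo locally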
data/BABILong NeurIPS24 Figs - leaderboard.csv CHANGED
@@ -21,8 +21,10 @@ activation-beacon-mistral-7b,avg,59,56,51,48,43,37,36,27,14,,,
 Phi-3-mini-128k-instruct,avg,64,57,55,51,50,46,42,37,7,,,
 ai21labs/Jamba-v0.1,avg,65,53,50,48,46,45,41,40,34,,,
 c4ai-command-r-v01,avg,64,64,63,61,59,52,51,46,38,,,
+Meta-Llama-3.1-8B-Instruct,avg,67,68,66,66,62,60,56,49,39,,,
 Phi-3-medium-128k-instruct,avg,72,70,67,62,60,57,53,45,30,,,
 GPT-4,avg,87,81,77,74,71,64,53,43,36,,,
+Meta-Llama-3.1-70B-Instruct,avg,85,81,78,74,70,65,59,53,45,,,
 ~ Mamba (130M) fine-tune,avg,,,,"98,7","98,5","98,5","98,1",97,"92,5",,,
 Llama3-ChatQA-1.5-8B + RAG,avg,48,48,47,46,45,45,44,42,45,42,39,37
 ~ RMT (137M) fine-tune,avg,"99,36","97,4","94,66","92,32","89,9","85,62","77,88","69,86","58,52","46,36","42,84","33,78"
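The two added rows store accuracy as whole-number percentages per context length (0K through 128K); the notebook added below converts rows like these into the per-length files under results/. A minimal sketch of that conversion for one row, assuming the same comma-decimal handling as the notebook:

import re
import pandas as pd

lens = [0, 1000, 2000, 4000, 8000, 16000, 32000, 64000, 128000]
len_names = ['0K', '1K', '2K', '4K', '8K', '16K', '32K', '64K', '128K']

# One of the rows added above (values are percent accuracy).
row = {'Model': 'Meta-Llama-3.1-8B-Instruct', 'task': 'avg',
       '0K': '67', '1K': '68', '2K': '66', '4K': '66', '8K': '62',
       '16K': '60', '32K': '56', '64K': '49', '128K': '39'}

for l, ln in zip(lens, len_names):
    score = row.get(ln)
    if score is None or pd.isna(score):
        continue
    score = float(re.sub(',', '.', score)) / 100  # "98,7" -> 0.987, "67" -> 0.67
    print(f"results/{row['Model']}/{row['task']}/{l}.csv -> result={score}")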
notebooks/process_results_csv.ipynb ADDED
@@ -0,0 +1,291 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import os\n",
+ "\n",
+ "import re"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "res_path = '../results'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "p = \"/home/jovyan/rmt/babilong-leaderboard/data/BABILong NeurIPS24 Figs - leaderboard.csv\"\n",
+ "res_df = pd.read_csv(p)\n",
+ "# res_df = res_df[res_df.task.isin(['qa1', 'qa2', 'qa3', 'qa4', 'qa5'])]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "lens = [0, 1000, 2000, 4000, 8000, 16000, 32000, 64000, 128000, 500000, 1000000, 10000000]\n",
+ "len_names = ['0K', '1K', '2K', '4K', '8K', '16K', '32K', '64K', '128K', '512K', '1M', '10M']\n",
+ "\n",
+ "for model_name in res_df.Model.unique():\n",
+ " model_df = res_df[res_df.Model == model_name]\n",
+ " model_name = re.sub('/', ' ', model_name)\n",
+ " for i, row in model_df.iterrows():\n",
+ " for l, ln in zip(lens, len_names):\n",
+ " score = row[ln]\n",
+ " # print(score)\n",
+ " if not pd.isna(score):\n",
+ " score = re.sub(',', '.', score)\n",
+ " score = float(score) / 100\n",
+ " os.makedirs(os.path.join(res_path, model_name), exist_ok=True)\n",
+ " os.makedirs(os.path.join(res_path, model_name, row.task), exist_ok=True)\n",
+ " path = os.path.join(res_path, model_name, row.task, f'{l}.csv')\n",
+ " df = pd.DataFrame([{'result': score}])\n",
+ " df.to_csv(path, index=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Calculate average results"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model_names = next(os.walk(res_path))[1]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>1</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " 1\n",
+ "0 2"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.DataFrame([{1: 2}])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'../results/GPT-3.5 fine-tuned (trained on 100 samples)/qa2'"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "task_path"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "GPT-4\n",
+ "GPT-3.5 fine-tuned (trained on 100 samples)\n",
+ "GPT-3.5 fine-tuned (trained on 1000 samples)\n",
+ "GPT-3.5\n",
+ "GPT4 + RAG by segments\n",
+ "GPT4 + RAG by sentences\n",
+ "GPT4 + Retrieve sentences (new 100 samples)\n",
+ "Mistral medium (xxB)\n",
+ "Mistral\n",
+ "GPT-2 (137M)\n",
+ "mamba-2.8b-hf\n",
+ "rwkv-6-world-7b\n",
+ "v5-Eagle-7B-HF\n",
+ "Meta-Llama-3-8B-Instruct\n",
+ "LLaMA-2-7B-32K\n",
+ "longchat-7b-v1.5-32k\n",
+ "LongAlpaca-13B\n",
+ "Llama-2-7B-32K-Instruct\n",
+ "Mistral-7b-Instruct-v0.2\n",
+ "Mixtral-8x7B-Instruct-v0.1\n",
+ "Mixtral-8x22B-Instruct-v0.1\n",
+ "activation-beacon-llama2-7b-chat\n",
+ "Yarn-Mistral-7b-128k\n",
+ "chatglm3-6b-128k\n",
+ "activation-beacon-mistral-7b\n",
+ "Phi-3-mini-128k-instruct\n",
+ "c4ai-command-r-v01\n",
+ "Phi-3-medium-128k-instruct\n",
+ "~ Mamba (130M) fine-tune\n",
+ "Llama3-ChatQA-1.5-8B + RAG\n",
+ "~ RMT (137M) fine-tune\n",
+ "~ ARMT (137M) fine-tune\n",
+ "01-ai Yi-34B\n",
+ "01-ai Yi-34B-200k\n",
+ "01-ai Yi-9B-200k\n",
+ "ai21labs Jamba-v0.1\n",
+ "~ RMT-Retrieval (137M) fine-tune\n",
+ "GPT-4 (gpt-4-0125-preview)\n",
+ "Meta-Llama-3.1-8B-Instruct\n",
+ "Meta-Llama-3.1-70B-Instruct\n"
+ ]
+ }
+ ],
+ "source": [
+ "for mn in model_names:\n",
+ " print(mn)\n",
+ " avg_path = os.path.join(res_path, mn, 'avg')\n",
+ " if os.path.exists(avg_path):\n",
+ " continue\n",
+ " \n",
+ " scores = {}\n",
+ " for task_name in [f'qa{i}' for i in range(1, 6)]:\n",
+ " task_path = os.path.join(res_path, mn, task_name)\n",
+ " if not os.path.exists(task_path):\n",
+ " continue\n",
+ "\n",
+ " filenames = next(os.walk(task_path))[2]\n",
+ " for fn in filenames:\n",
+ " len_name = fn.split('.')[0]\n",
+ " df = pd.read_csv(os.path.join(task_path, fn))\n",
+ " \n",
+ " score = df.result.mean()\n",
+ " if len_name not in scores:\n",
+ " scores[len_name] = [score]\n",
+ " else:\n",
+ " scores[len_name].append(score)\n",
+ "\n",
+ " for k,v in scores.items():\n",
+ " sc = np.mean(v)\n",
+ " out_path = os.path.join(avg_path, k + '.csv')\n",
+ " df = pd.DataFrame([{'result': sc}])\n",
+ " if len(v) < 5:\n",
+ " continue\n",
+ " os.makedirs(avg_path, exist_ok=True)\n",
+ " df.to_csv(out_path, index=False)\n",
+ " print(out_path)\n",
+ " # 1/0\n",
+ "\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'16000': [0.58], '32000': [0.33], '4000': [0.73], '8000': [0.75]}"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "scores"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
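For readability, the averaging cell above boils down to the following plain script, under the same assumptions as the notebook: results/<model>/<task>/<length>.csv files with a single result column, and an avg file written only when all five QA tasks cover that context length.

import os
import numpy as np
import pandas as pd

res_path = '../results'

for model_name in next(os.walk(res_path))[1]:
    avg_path = os.path.join(res_path, model_name, 'avg')
    if os.path.exists(avg_path):
        continue  # averages for this model were already written

    scores = {}  # context length -> one mean score per QA task
    for task_name in [f'qa{i}' for i in range(1, 6)]:
        task_path = os.path.join(res_path, model_name, task_name)
        if not os.path.exists(task_path):
            continue
        for fn in next(os.walk(task_path))[2]:
            length = fn.split('.')[0]
            score = pd.read_csv(os.path.join(task_path, fn)).result.mean()
            scores.setdefault(length, []).append(score)

    for length, task_scores in scores.items():
        if len(task_scores) < 5:
            continue  # skip lengths not covered by all five QA tasks
        os.makedirs(avg_path, exist_ok=True)
        pd.DataFrame([{'result': np.mean(task_scores)}]).to_csv(
            os.path.join(avg_path, f'{length}.csv'), index=False)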
notebooks/test.ipynb DELETED
@@ -1,78 +0,0 @@
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "import os"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "metadata": {},
- "outputs": [],
- "source": [
- "res_path = '../results'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "metadata": {},
- "outputs": [],
- "source": [
- "p = \"/home/jovyan/rmt/babilong-leaderboard/data/BABILong NeurIPS24 Figs - leaderboard.csv\"\n",
- "res_df = pd.read_csv(p)\n",
- "res_df = res_df[res_df.task.isin(['qa1', 'qa2', 'qa3', 'qa4', 'qa5'])]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "metadata": {},
- "outputs": [],
- "source": [
- "lens = [0, 1000, 2000, 4000, 8000, 16000, 32000, 64000, 128000, 500000, 1000000, 10000000]\n",
- "len_names = ['0K', '1K', '2K', '4K', '8K', '16K', '32K', '64K', '128K', '512K', '1M', '10M']\n",
- "\n",
- "for model_name in res_df.Model.unique():\n",
- " model_df = res_df[res_df.Model == model_name]\n",
- " for i, row in model_df.iterrows():\n",
- " for l, ln in zip(lens, len_names):\n",
- " score = row[ln]\n",
- " # print(score)\n",
- " if not pd.isna(score):\n",
- " os.makedirs(os.path.join(res_path, model_name), exist_ok=True)\n",
- " os.makedirs(os.path.join(res_path, model_name, row.task), exist_ok=True)\n",
- " path = os.path.join(res_path, model_name, row.task, f'{l}.csv')\n",
- " df = pd.DataFrame([{'result': score}])\n",
- " df.to_csv(path, index=False)"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.13"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
results/01-ai Yi-34B-200k/avg/0.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.65
results/01-ai Yi-34B-200k/avg/1000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.59
results/01-ai Yi-34B-200k/avg/16000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.5
results/01-ai Yi-34B-200k/avg/2000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.56
results/01-ai Yi-34B-200k/avg/32000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.48
results/01-ai Yi-34B-200k/avg/4000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.54
results/01-ai Yi-34B-200k/avg/64000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.48
results/01-ai Yi-34B-200k/avg/8000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.52
results/01-ai Yi-34B/avg/0.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.72
results/01-ai Yi-34B/avg/1000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.52
results/01-ai Yi-34B/avg/16000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.31
results/01-ai Yi-34B/avg/2000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.43
results/01-ai Yi-34B/avg/32000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.04
results/01-ai Yi-34B/avg/4000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.37
results/01-ai Yi-34B/avg/8000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.38
results/01-ai Yi-9B-200k/avg/0.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.52
results/01-ai Yi-9B-200k/avg/1000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.55
results/01-ai Yi-9B-200k/avg/128000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.24
results/01-ai Yi-9B-200k/avg/16000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.36
results/01-ai Yi-9B-200k/avg/2000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.48
results/01-ai Yi-9B-200k/avg/32000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.37
results/01-ai Yi-9B-200k/avg/4000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.46
results/01-ai Yi-9B-200k/avg/64000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.29
results/01-ai Yi-9B-200k/avg/8000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.45
results/GPT-2 (137M)/avg/0.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.27
results/GPT-2 (137M)/avg/1000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.15
results/GPT-4 (gpt-4-0125-preview)/avg/0.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.874
results/GPT-4 (gpt-4-0125-preview)/avg/1000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.8140000000000001
results/GPT-4 (gpt-4-0125-preview)/avg/128000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.358
results/GPT-4 (gpt-4-0125-preview)/avg/16000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.6399999999999999
results/GPT-4 (gpt-4-0125-preview)/avg/2000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.768
results/GPT-4 (gpt-4-0125-preview)/avg/32000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.526
results/GPT-4 (gpt-4-0125-preview)/avg/4000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.738
results/GPT-4 (gpt-4-0125-preview)/avg/64000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.42800000000000005
results/GPT-4 (gpt-4-0125-preview)/avg/8000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.7120000000000001
results/LLaMA-2-7B-32K/avg/0.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.41
results/LLaMA-2-7B-32K/avg/1000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.53
results/LLaMA-2-7B-32K/avg/16000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.32
results/LLaMA-2-7B-32K/avg/2000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.45
results/LLaMA-2-7B-32K/avg/32000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.03
results/LLaMA-2-7B-32K/avg/4000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.4
results/LLaMA-2-7B-32K/avg/8000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.39
results/Llama-2-7B-32K-Instruct/avg/0.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.49
results/Llama-2-7B-32K-Instruct/avg/1000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.52
results/Llama-2-7B-32K-Instruct/avg/16000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.35
results/Llama-2-7B-32K-Instruct/avg/2000.csv ADDED
@@ -0,0 +1,2 @@
+ result
+ 0.49
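Each file added above holds a single result value in [0, 1], so a model's row for the new average tab can be assembled by reading every <length>.csv under results/<model>/avg/. A minimal reader sketch; read_avg_scores is a hypothetical helper, and the leaderboard itself uses load_model in app.py, which may differ:

import os
import pandas as pd

def read_avg_scores(model_dir):
    """Return {context_length: score} from results/<model>/avg/<length>.csv files."""
    avg_dir = os.path.join(model_dir, 'avg')
    scores = {}
    for fn in os.listdir(avg_dir):
        if fn.endswith('.csv'):
            length = int(fn.split('.')[0])
            scores[length] = pd.read_csv(os.path.join(avg_dir, fn))['result'].iloc[0]
    return dict(sorted(scores.items()))

print(read_avg_scores('results/LLaMA-2-7B-32K'))
# From the files in this commit: {0: 0.41, 1000: 0.53, 2000: 0.45, 4000: 0.4, 8000: 0.39, 16000: 0.32, 32000: 0.03}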