Sean Cho committed on
Commit
bba982c
·
1 Parent(s): fced05c

add private repo

Browse files
.gitignore CHANGED
@@ -10,6 +10,8 @@ gpt_4_evals/
10
  human_evals/
11
  eval-queue/
12
  eval-results/
 
 
13
  auto_evals/
14
 
15
  src/assets/model_counts.html
 
10
  human_evals/
11
  eval-queue/
12
  eval-results/
13
+ eval-queue-private/
14
+ eval-results-private/
15
  auto_evals/
16
 
17
  src/assets/model_counts.html
app.py CHANGED
@@ -2,6 +2,7 @@ import json
2
  import os
3
  from datetime import datetime, timezone
4
  import re
 
5
 
6
  import gradio as gr
7
  import pandas as pd
@@ -38,10 +39,10 @@ H4_TOKEN = os.environ.get("H4_TOKEN", None)
38
  QUEUE_REPO = "open-ko-llm-leaderboard/requests"
39
  RESULTS_REPO = "open-ko-llm-leaderboard/results"
40
 
41
- PRIVATE_QUEUE_REPO = "open-ko-llm-leaderboard/requests"
42
- PRIVATE_RESULTS_REPO = "open-ko-llm-leaderboard/results"
43
 
44
- IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
45
 
46
  EVAL_REQUESTS_PATH = "eval-queue"
47
  EVAL_RESULTS_PATH = "eval-results"
 
2
  import os
3
  from datetime import datetime, timezone
4
  import re
5
+ from distutils.util import strtobool
6
 
7
  import gradio as gr
8
  import pandas as pd
 
39
  QUEUE_REPO = "open-ko-llm-leaderboard/requests"
40
  RESULTS_REPO = "open-ko-llm-leaderboard/results"
41
 
42
+ PRIVATE_QUEUE_REPO = "open-ko-llm-leaderboard/private-requests"
43
+ PRIVATE_RESULTS_REPO = "open-ko-llm-leaderboard/private-results"
44
 
45
+ IS_PUBLIC = bool(strtobool(os.environ.get("IS_PUBLIC", "True")))
46
 
47
  EVAL_REQUESTS_PATH = "eval-queue"
48
  EVAL_RESULTS_PATH = "eval-results"
src/assets/hardcoded_evals.py CHANGED
@@ -1,31 +1,5 @@
1
  from src.display_models.utils import AutoEvalColumn, model_hyperlink
2
 
3
- gpt4_values = {
4
- AutoEvalColumn.model.name: model_hyperlink("https://arxiv.org/abs/2303.08774", "gpt4"),
5
- AutoEvalColumn.revision.name: "tech report",
6
- AutoEvalColumn.precision.name: None,
7
- AutoEvalColumn.average.name: 84.3,
8
- AutoEvalColumn.arc.name: 96.3,
9
- AutoEvalColumn.hellaswag.name: 95.3,
10
- AutoEvalColumn.mmlu.name: 86.4,
11
- AutoEvalColumn.truthfulqa.name: 59.0,
12
- AutoEvalColumn.dummy.name: "GPT-4",
13
- AutoEvalColumn.model_type.name: "",
14
- }
15
-
16
- gpt35_values = {
17
- AutoEvalColumn.model.name: model_hyperlink("https://arxiv.org/abs/2303.08774", "gpt3.5"),
18
- AutoEvalColumn.revision.name: "tech report",
19
- AutoEvalColumn.precision.name: None,
20
- AutoEvalColumn.average.name: 71.9,
21
- AutoEvalColumn.arc.name: 85.2,
22
- AutoEvalColumn.hellaswag.name: 85.5,
23
- AutoEvalColumn.mmlu.name: 70.0,
24
- AutoEvalColumn.truthfulqa.name: 47.0,
25
- AutoEvalColumn.dummy.name: "GPT-3.5",
26
- AutoEvalColumn.model_type.name: "",
27
- }
28
-
29
  baseline = {
30
  AutoEvalColumn.model.name: "<p>Baseline</p>",
31
  AutoEvalColumn.revision.name: "N/A",
 
1
  from src.display_models.utils import AutoEvalColumn, model_hyperlink
2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  baseline = {
4
  AutoEvalColumn.model.name: "<p>Baseline</p>",
5
  AutoEvalColumn.revision.name: "N/A",
src/display_models/read_results.py CHANGED
@@ -2,6 +2,7 @@ import json
2
  import os
3
  from dataclasses import dataclass
4
  from typing import Dict, List, Tuple
 
5
 
6
  import dateutil
7
  import numpy as np
@@ -19,7 +20,7 @@ BENCH_TO_NAME = {
19
  # TODO: Uncomment when we have results for these
20
  # "ethicalverification": AutoEvalColumn.ethicalverification.name,
21
  }
22
-
23
 
24
  @dataclass
25
  class EvalResult:
@@ -114,7 +115,7 @@ def parse_eval_result(json_filepath: str) -> Tuple[str, list[dict]]:
114
  def get_eval_results() -> List[EvalResult]:
115
  json_filepaths = []
116
 
117
- for root, dir, files in os.walk("eval-results"):
118
  # We should only have json files in model results
119
  if len(files) == 0 or any([not f.endswith(".json") for f in files]):
120
  continue
 
2
  import os
3
  from dataclasses import dataclass
4
  from typing import Dict, List, Tuple
5
+ from distutils.util import strtobool
6
 
7
  import dateutil
8
  import numpy as np
 
20
  # TODO: Uncomment when we have results for these
21
  # "ethicalverification": AutoEvalColumn.ethicalverification.name,
22
  }
23
+ IS_PUBLIC = bool(strtobool(os.environ.get("IS_PUBLIC", "True")))
24
 
25
  @dataclass
26
  class EvalResult:
 
115
  def get_eval_results() -> List[EvalResult]:
116
  json_filepaths = []
117
 
118
+ for root, dir, files in os.walk("eval-results" + ("-private" if not IS_PUBLIC else "")):
119
  # We should only have json files in model results
120
  if len(files) == 0 or any([not f.endswith(".json") for f in files]):
121
  continue
src/load_from_hub.py CHANGED
@@ -6,13 +6,11 @@ from huggingface_hub import Repository
6
  from transformers import AutoConfig
7
  from collections import defaultdict
8
 
9
- from src.assets.hardcoded_evals import baseline, gpt4_values, gpt35_values
10
  from src.display_models.get_model_metadata import apply_metadata
11
  from src.display_models.read_results import get_eval_results_dicts, make_clickable_model
12
  from src.display_models.utils import AutoEvalColumn, EvalQueueColumn, has_no_nan_values
13
 
14
- IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
15
-
16
 
17
  def get_all_requested_models(requested_models_dir: str) -> set[str]:
18
  depth = 1
@@ -75,10 +73,6 @@ def get_leaderboard_df(
75
 
76
  all_data = get_eval_results_dicts()
77
 
78
- if not IS_PUBLIC:
79
- all_data.append(gpt4_values)
80
- all_data.append(gpt35_values)
81
-
82
  # all_data.append(baseline)
83
  apply_metadata(all_data) # Populate model type based on known hardcoded values in `metadata.py`
84
 
 
6
  from transformers import AutoConfig
7
  from collections import defaultdict
8
 
9
+ from src.assets.hardcoded_evals import baseline
10
  from src.display_models.get_model_metadata import apply_metadata
11
  from src.display_models.read_results import get_eval_results_dicts, make_clickable_model
12
  from src.display_models.utils import AutoEvalColumn, EvalQueueColumn, has_no_nan_values
13
 
 
 
14
 
15
  def get_all_requested_models(requested_models_dir: str) -> set[str]:
16
  depth = 1
 
73
 
74
  all_data = get_eval_results_dicts()
75
 
 
 
 
 
76
  # all_data.append(baseline)
77
  apply_metadata(all_data) # Populate model type based on known hardcoded values in `metadata.py`
78