Spaces:
Running
Running
AtsuMiyai
commited on
Commit
β’
6ec92ad
0
Parent(s):
initial commit
Browse files- .gitattributes +55 -0
- .gitignore +13 -0
- .pre-commit-config.yaml +53 -0
- Makefile +13 -0
- README.md +45 -0
- app.py +368 -0
- constants.py +67 -0
- download_from_dataset +1 -0
- pyproject.toml +13 -0
- requirements.txt +19 -0
.gitattributes
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.lz4 filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
26 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
27 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
36 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
37 |
+
# Audio files - uncompressed
|
38 |
+
*.pcm filter=lfs diff=lfs merge=lfs -text
|
39 |
+
*.sam filter=lfs diff=lfs merge=lfs -text
|
40 |
+
*.raw filter=lfs diff=lfs merge=lfs -text
|
41 |
+
# Audio files - compressed
|
42 |
+
*.aac filter=lfs diff=lfs merge=lfs -text
|
43 |
+
*.flac filter=lfs diff=lfs merge=lfs -text
|
44 |
+
*.mp3 filter=lfs diff=lfs merge=lfs -text
|
45 |
+
*.ogg filter=lfs diff=lfs merge=lfs -text
|
46 |
+
*.wav filter=lfs diff=lfs merge=lfs -text
|
47 |
+
# Image files - uncompressed
|
48 |
+
*.bmp filter=lfs diff=lfs merge=lfs -text
|
49 |
+
*.gif filter=lfs diff=lfs merge=lfs -text
|
50 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
51 |
+
*.tiff filter=lfs diff=lfs merge=lfs -text
|
52 |
+
# Image files - compressed
|
53 |
+
*.jpg filter=lfs diff=lfs merge=lfs -text
|
54 |
+
*.jpeg filter=lfs diff=lfs merge=lfs -text
|
55 |
+
*.webp filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
auto_evals/
|
2 |
+
venv/
|
3 |
+
__pycache__/
|
4 |
+
.env
|
5 |
+
.ipynb_checkpoints
|
6 |
+
*ipynb
|
7 |
+
.vscode/
|
8 |
+
|
9 |
+
eval-queue/
|
10 |
+
eval-results/
|
11 |
+
eval-queue-bk/
|
12 |
+
eval-results-bk/
|
13 |
+
logs/
|
.pre-commit-config.yaml
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
default_language_version:
|
16 |
+
python: python3
|
17 |
+
|
18 |
+
ci:
|
19 |
+
autofix_prs: true
|
20 |
+
autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
|
21 |
+
autoupdate_schedule: quarterly
|
22 |
+
|
23 |
+
repos:
|
24 |
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
25 |
+
rev: v4.3.0
|
26 |
+
hooks:
|
27 |
+
- id: check-yaml
|
28 |
+
- id: check-case-conflict
|
29 |
+
- id: detect-private-key
|
30 |
+
- id: check-added-large-files
|
31 |
+
args: ['--maxkb=1000']
|
32 |
+
- id: requirements-txt-fixer
|
33 |
+
- id: end-of-file-fixer
|
34 |
+
- id: trailing-whitespace
|
35 |
+
|
36 |
+
- repo: https://github.com/PyCQA/isort
|
37 |
+
rev: 5.12.0
|
38 |
+
hooks:
|
39 |
+
- id: isort
|
40 |
+
name: Format imports
|
41 |
+
|
42 |
+
- repo: https://github.com/psf/black
|
43 |
+
rev: 22.12.0
|
44 |
+
hooks:
|
45 |
+
- id: black
|
46 |
+
name: Format code
|
47 |
+
additional_dependencies: ['click==8.0.2']
|
48 |
+
|
49 |
+
- repo: https://github.com/charliermarsh/ruff-pre-commit
|
50 |
+
# Ruff version.
|
51 |
+
rev: 'v0.0.267'
|
52 |
+
hooks:
|
53 |
+
- id: ruff
|
Makefile
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Both targets are commands, not files, so both must be declared phony.
# (The previous declaration listed a nonexistent `format` target and omitted `quality`.)
.PHONY: style quality


# Reformat the code base in place and apply auto-fixable lint rules.
style:
	python -m black --line-length 119 .
	python -m isort .
	ruff check --fix .


# Check formatting and lint rules without modifying any file.
quality:
	python -m black --check --line-length 119 .
	python -m isort --check-only .
	ruff check .
|
README.md
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: JMMMU Leaderboard
|
3 |
+
emoji: 🥇
|
4 |
+
colorFrom: green
|
5 |
+
colorTo: indigo
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 4.44.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: true
|
10 |
+
license: apache-2.0
|
11 |
+
---
|
12 |
+
|
13 |
+
# Start the configuration
|
14 |
+
|
15 |
+
Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
|
16 |
+
|
17 |
+
Results files should have the following format and be stored as json files:
|
18 |
+
```json
|
19 |
+
{
|
20 |
+
"config": {
|
21 |
+
"model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
|
22 |
+
"model_name": "path of the model on the hub: org/model",
|
23 |
+
"model_sha": "revision on the hub",
|
24 |
+
},
|
25 |
+
"results": {
|
26 |
+
"task_name": {
|
27 |
+
"metric_name": score,
|
28 |
+
},
|
29 |
+
"task_name2": {
|
30 |
+
"metric_name": score,
|
31 |
+
}
|
32 |
+
}
|
33 |
+
}
|
34 |
+
```
|
35 |
+
|
36 |
+
Request files are created automatically by this tool.
|
37 |
+
|
38 |
+
If you encounter a problem on the Space, don't hesitate to restart it to remove the created eval-queue, eval-queue-bk, eval-results and eval-results-bk folders.
|
39 |
+
|
40 |
+
# Code logic for more complex edits
|
41 |
+
|
42 |
+
You'll find
|
43 |
+
- the main table's column names and properties in `src/display/utils.py`
|
44 |
+
- the logic to read all results and request files, then convert them in dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
|
45 |
+
- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
|
app.py
ADDED
@@ -0,0 +1,368 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Public API of this module. Only names that are actually defined in this file
# may be listed here; listing a missing name makes `from app import *` raise
# AttributeError.
__all__ = ['block']
|
2 |
+
|
3 |
+
import gradio as gr
|
4 |
+
import pandas as pd
|
5 |
+
import re
|
6 |
+
import pandas as pd
|
7 |
+
import numpy as np
|
8 |
+
from collections import defaultdict
|
9 |
+
from constants import *
|
10 |
+
import os
|
11 |
+
from huggingface_hub import Repository
|
12 |
+
import json
|
13 |
+
|
14 |
+
|
15 |
+
# NOTE: a `global` statement at module scope has no effect; it is kept as-is.
global data_component, filter_component


# Access token for the Hugging Face Hub, read from the environment (may be None).
TOKEN = os.environ.get("TOKEN")

# Local clone of the dataset repository that holds the leaderboard results.
repo = Repository(
    local_dir="./download_from_dataset",
    clone_from="JMMMU/leaderboard_result",
    repo_type="dataset",
    use_auth_token=TOKEN,
)

# Working directory at import time.
current_directory = os.getcwd()
|
22 |
+
|
23 |
+
|
24 |
+
def validate_model_size(s):
    """Return *s* if it is a valid model-size label, otherwise ``'-'``.

    A valid label is either one or more digits followed by an upper-case
    ``B`` (e.g. ``"7B"``) or the single character ``"-"``.

    The whole string must match: ``re.fullmatch`` is used instead of
    ``re.match`` with ``$`` because ``$`` also matches just before a trailing
    newline, which let values such as ``"7B\\n"`` through unchanged.
    Non-string input (e.g. ``None``) is treated as invalid instead of raising.
    """
    if isinstance(s, str) and re.fullmatch(r'\d+B|-', s):
        return s
    return '-'
|
30 |
+
|
31 |
+
|
32 |
+
def upload_file(files):
    """Return the ``name`` attribute of every item in *files*, in order."""
    collected = []
    for uploaded in files:
        collected.append(uploaded.name)
    return collected
|
35 |
+
|
36 |
+
|
37 |
+
def get_acc(data, subject_list):
    """Average the ``'jmmmu_acc,none'`` score over *subject_list*.

    Reads ``data["results"][subject]['jmmmu_acc,none']`` for each subject,
    takes the arithmetic mean, multiplies it by 100 and rounds to one decimal.
    Raises ``KeyError`` for a missing subject and ``ZeroDivisionError`` for an
    empty *subject_list*.
    """
    per_subject = data["results"]
    total = 0
    # Accumulate left to right so float results match a plain running sum.
    for name in subject_list:
        total = total + per_subject[name]['jmmmu_acc,none']
    mean = total / len(subject_list)
    return round(mean * 100, 1)
|
45 |
+
|
46 |
+
|
47 |
+
def calculate_score(input_file):
    """Compute the leaderboard scores from an uploaded result file.

    Args:
        input_file: UTF-8 encoded bytes of a JSON document with a top-level
            ``"results"`` mapping from task name to a mapping that contains
            the key ``'jmmmu_acc,none'``.

    Returns:
        A dict with the keys ``overall``, ``cultureSpecific``,
        ``cultureAgnostic``, ``japaneseArt``, ``japaneseHeritage``,
        ``japaneseHistory``, ``worldHistory``, ``artPsychology``,
        ``business``, ``science``, ``healthMedicine`` and ``techEngineering``.
        Each value is the score multiplied by 100 and rounded to one decimal.

    Raises:
        UnicodeDecodeError / json.JSONDecodeError: if the payload is not
            valid UTF-8 JSON.
        KeyError: if an expected task or metric is missing.
    """
    data = json.loads(input_file.decode('utf-8'))
    results = data["results"]

    def headline(task_name):
        # Aggregate tasks are reported directly by the evaluation output.
        return round(results[task_name]['jmmmu_acc,none'] * 100, 1)

    overall = headline("jmmmu")
    ca = headline("culture_agnostic")
    cs = headline("culture_specific")

    # Art_Psychology
    art_psychology_subject_list = ["jmmmu_design", "jmmmu_music", "jmmmu_psychology"]
    # Science -- previously listed "jmmmu_physics" twice and left out
    # "jmmmu_math", which double-weighted physics and ignored math.
    science_subject_list = ["jmmmu_biology", "jmmmu_chemistry", "jmmmu_math", "jmmmu_physics"]
    # Business
    business_subject_list = ["jmmmu_accounting", "jmmmu_economics", "jmmmu_finance", "jmmmu_manage", "jmmmu_marketing"]
    # Medicine
    medicine_subject_list = [
        "jmmmu_basic_medical_science",
        "jmmmu_clinical_medicine",
        "jmmmu_diagnostics_and_laboratory_medicine",
        "jmmmu_pharmacy",
        "jmmmu_public_health",
    ]
    # Tech_Eng.
    tech_eng_subject_list = [
        "jmmmu_agriculture",
        "jmmmu_architecture_and_engineering",
        "jmmmu_computer_science",
        "jmmmu_electronics",
        "jmmmu_energy_and_power",
        "jmmmu_materials",
        "jmmmu_mechanical_engineering",
    ]

    return {
        "overall": overall,
        "cultureSpecific": cs,
        "cultureAgnostic": ca,
        "japaneseArt": get_acc(data, ["jmmmu_japanese_art"]),
        "japaneseHeritage": get_acc(data, ["jmmmu_japanese_heritage"]),
        "japaneseHistory": get_acc(data, ["jmmmu_japanese_history"]),
        "worldHistory": get_acc(data, ["jmmmu_world_history"]),
        "artPsychology": get_acc(data, art_psychology_subject_list),
        "business": get_acc(data, business_subject_list),
        "science": get_acc(data, science_subject_list),
        "healthMedicine": get_acc(data, medicine_subject_list),
        "techEngineering": get_acc(data, tech_eng_subject_list),
    }
|
100 |
+
|
101 |
+
|
102 |
+
def _displayed_model_name(cell):
    """Return the plain model name stored in a 'Model' cell.

    Cells are either a bare name or a markdown link ``[name](url)``.
    """
    text = str(cell)
    if text.startswith('[') and ']' in text:
        return text[1:text.index(']')]
    return text


def add_new_eval(
    input_file,
    model_type: str,
    model_name_textbox: str,
    revision_name_textbox: str,
    model_link: str,
    model_size: str,
    # upd_type: str,
    # question_type: str
):
    """Score an uploaded result file and record it in the leaderboard repo.

    Args:
        input_file: raw bytes of the uploaded JSON file, or ``None``.
        model_type: value written to the first result column.
        model_name_textbox: model name for a new entry.
        revision_name_textbox: name of an existing entry to overwrite; when
            empty, a new row is appended instead.
        model_link: optional URL; when given the model name is stored as a
            markdown link.
        model_size: size label, normalized with ``validate_model_size``.

    Returns:
        A warning string when the submission is rejected, ``0`` on success.

    Raises:
        FileNotFoundError: if the result CSV is missing after being written.
    """
    if input_file is None:
        warning_text = "Error! Empty file!"
        print(warning_text)
        return warning_text

    model_size = validate_model_size(model_size)
    csv_path = CSV_RESULT_PATH

    # The upload is untrusted: report malformed files instead of crashing.
    try:
        result_dict = calculate_score(input_file)
    except (ValueError, KeyError, TypeError) as err:
        warning_text = f"Error! Invalid result file: {err!r}"
        print(warning_text)
        return warning_text

    is_revision = revision_name_textbox != ''
    model_name = revision_name_textbox if is_revision else model_name_textbox
    if not model_name or not model_name.strip():
        warning_text = "Error! Empty model name!"
        print(warning_text)
        return warning_text

    # Sync with the remote before reading, so the edit is applied on top of
    # the latest results and the later push does not conflict with a pull
    # over a locally modified file.
    repo.git_pull()
    csv_data = pd.read_csv(csv_path)

    existing_names = [_displayed_model_name(cell) for cell in csv_data['Model']]
    if is_revision and model_name in existing_names:
        # Overwrite the existing row of the revised model.
        col = existing_names.index(model_name)
    else:
        # Append a new row.
        col = csv_data.shape[0]

    model_name_wo_link = model_name
    if model_link != '':
        model_name = '[' + model_name + '](' + model_link + ')'

    # add new data
    new_data = [
        model_type,
        model_name,
        model_size,
        result_dict["overall"],
        result_dict["cultureSpecific"],
        result_dict["cultureAgnostic"],
        result_dict["japaneseArt"],
        result_dict["japaneseHeritage"],
        result_dict["japaneseHistory"],
        result_dict["worldHistory"],
        result_dict["artPsychology"],
        result_dict["business"],
        result_dict["science"],
        result_dict["healthMedicine"],
        result_dict["techEngineering"],
    ]

    # If the same data already exists, return an error.
    if new_data in csv_data.values.tolist():
        warning_text = "Error! The same data already exists!"
        print(warning_text)
        return warning_text
    # If the same model name already exists, return an error.
    # (The previous check compared a 5-item slice against full rows and could
    # never match.)
    if not is_revision and model_name_wo_link in existing_names:
        warning_text = "Error! The same data already exists! Please fill revision_name."
        print(warning_text)
        return warning_text

    csv_data.loc[col] = new_data
    csv_data.to_csv(csv_path, index=False)

    absolute_result_path = os.path.abspath(csv_path)
    if not os.path.exists(absolute_result_path):
        raise FileNotFoundError(f"File {absolute_result_path} not found")

    repo.git_add(absolute_result_path)

    # The model name is user input: strip path separators so the upload is
    # always written directly inside the queue directory.
    safe_file_stem = re.sub(r'[\\/]', '_', model_name_wo_link)
    os.makedirs(CSV_QUEUE_DIR, exist_ok=True)
    save_path = os.path.join(CSV_QUEUE_DIR, f"{safe_file_stem}.json")
    with open(save_path, "wb") as f:
        f.write(input_file)

    absolute_queue_path = os.path.abspath(save_path)

    repo.git_add(absolute_queue_path)
    repo.git_commit(f"add {model_name_wo_link} results")
    repo.git_push()
    print(f"Success! Your {model_name_wo_link} has been added!")

    return 0
|
198 |
+
|
199 |
+
|
200 |
+
def get_baseline_df():
    """Return the leaderboard table restricted to the currently shown columns.

    Pulls the results repository, loads the result CSV, orders rows by the
    "Overall" column (highest first) and keeps ``MODEL_INFO`` plus the columns
    currently held in ``checkbox_group.value``.
    """
    repo.git_pull()
    ranked = pd.read_csv(CSV_RESULT_PATH).sort_values(by="Overall", ascending=False)
    visible_columns = MODEL_INFO + checkbox_group.value
    return ranked[visible_columns]
|
207 |
+
|
208 |
+
|
209 |
+
def get_all_df():
    """Return the full leaderboard table ordered by "Overall", highest first.

    The results repository is pulled before the result CSV is read.
    """
    repo.git_pull()
    return pd.read_csv(CSV_RESULT_PATH).sort_values(by="Overall", ascending=False)
|
214 |
+
|
215 |
+
|
216 |
+
|
217 |
+
# Gradio UI: a leaderboard tab with column / model-size filters, a submission
# tab, a refresh button and a citation box.
# NOTE(review): the nesting of the layout below was reconstructed from the
# statement order -- confirm against the deployed layout.
block = gr.Blocks()


with block:
    gr.Markdown(LEADERBORAD_INTRODUCTION)
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # table jmmmu bench
        with gr.TabItem("🏅 JMMMU Benchmark", elem_id="jmmmu-benchmark-tab-table", id=1):
            # selection for column part:
            checkbox_group = gr.CheckboxGroup(
                choices=TASK_INFO,
                value=AVG_INFO,
                label="Evaluation Dimension",
                interactive=True,
            )  # user can select the evaluation dimension

            with gr.Row():
                # selection for model size part:
                model_size = gr.CheckboxGroup(
                    choices=MODEL_SIZE,
                    value=MODEL_SIZE,
                    label="Model Size",
                    interactive=True,
                )

            baseline_value = get_baseline_df()
            baseline_header = MODEL_INFO + checkbox_group.value
            baseline_datatype = ['markdown'] * 2 + ['number'] * len(checkbox_group.value)

            data_component = gr.components.Dataframe(
                value=baseline_value,
                headers=baseline_header,
                type="pandas",
                datatype=baseline_datatype,
                interactive=False,
                visible=True,
            )

            def on_filter_model_size_method_change(selected_model_size, selected_columns):
                """Rebuild the leaderboard table for the selected sizes and columns.

                Args:
                    selected_model_size: selected entries of ``MODEL_SIZE``.
                    selected_columns: selected entries of ``TASK_INFO``.

                Returns:
                    A new ``Dataframe`` component holding the filtered rows,
                    sorted by the first selected column (highest first) when
                    at least one column is selected.
                """
                updated_data = get_all_df()

                # model_size
                def custom_filter(row, model_size_filters):
                    # str() so that a missing (non-string) cell does not raise.
                    size_label = str(row['Model Size']).upper()

                    if size_label == '-':
                        return '-' in model_size_filters
                    if 'B' not in size_label:
                        return False
                    try:
                        size = float(size_label.replace('B', ''))
                    except ValueError:
                        # Unparsable size: hide the row instead of crashing.
                        return False
                    return ('>=10B' in model_size_filters and size >= 10) or (
                        '<10B' in model_size_filters and size < 10
                    )

                mask = updated_data.apply(custom_filter, axis=1, model_size_filters=selected_model_size)
                updated_data = updated_data[mask]

                # columns: keep the canonical TASK_INFO order
                selected_columns = [item for item in TASK_INFO if item in selected_columns]
                present_columns = MODEL_INFO + selected_columns
                updated_data = updated_data[present_columns]
                # With every column deselected there is nothing to sort by;
                # indexing selected_columns[0] used to raise IndexError here.
                if selected_columns:
                    updated_data = updated_data.sort_values(by=selected_columns[0], ascending=False)
                updated_headers = present_columns
                update_datatype = [DATA_TITILE_TYPE[COLUMN_NAMES.index(x)] for x in updated_headers]

                filter_component = gr.components.Dataframe(
                    value=updated_data,
                    headers=updated_headers,
                    type="pandas",
                    datatype=update_datatype,
                    interactive=False,
                    visible=True,
                )
                return filter_component

            model_size.change(
                fn=on_filter_model_size_method_change,
                inputs=[model_size, checkbox_group],
                outputs=data_component,
            )
            checkbox_group.change(
                fn=on_filter_model_size_method_change,
                inputs=[model_size, checkbox_group],
                outputs=data_component,
            )

        # table 5
        with gr.TabItem("🚀 Submit here! ", elem_id="jmmmu-benchmark-tab-table", id=5):
            with gr.Row():
                gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")

            with gr.Row():
                gr.Markdown("# ✉️✨ Submit your model evaluation json file here!", elem_classes="markdown-text")

            with gr.Row():
                with gr.Column():
                    model_type = gr.Dropdown(
                        choices=["LMM", "LLM"],
                        label="Model type",
                        multiselect=False,
                        value="LMM",
                        interactive=True,
                    )
                    model_name_textbox = gr.Textbox(
                        label="Model name", placeholder="LLaMA-7B"
                    )
                    revision_name_textbox = gr.Textbox(
                        label="Revision Model Name", placeholder="LLaMA-7B"
                    )
                    model_link = gr.Textbox(
                        label="Model Link", placeholder="https://huggingface.co/decapoda-research/llama-7b-hf"
                    )
                    # NOTE: rebinds `model_size`; the size filter above was
                    # already wired to the CheckboxGroup before this point.
                    model_size = gr.Textbox(
                        label="Model size", placeholder="7B(Input content format must be 'number+B' or '-', default is '-')"
                    )

                with gr.Column():
                    input_file = gr.components.File(label="Click to Upload a JSON File", file_count="single", type='binary')
                    submit_button = gr.Button("Submit Eval")

                    # NOTE(review): this component is never used as an output,
                    # so the value returned by add_new_eval is not displayed.
                    submission_result = gr.Markdown()
                    submit_button.click(
                        add_new_eval,
                        inputs=[
                            input_file,
                            model_type,
                            model_name_textbox,
                            revision_name_textbox,
                            model_link,
                            model_size,
                        ],
                    )

    def refresh_data():
        """Reload the leaderboard table with the initially selected columns."""
        value = get_baseline_df()

        return value

    with gr.Row():
        data_run = gr.Button("Refresh")
        data_run.click(
            refresh_data, outputs=[data_component]
        )

    with gr.Accordion("Citation", open=False):
        citation_button = gr.Textbox(
            value=CITATION_BUTTON_TEXT,
            label=CITATION_BUTTON_LABEL,
            elem_id="citation-button",
            show_copy_button=True,
        )

block.launch()
|
constants.py
ADDED
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Constants shared by the leaderboard app: column definitions, data paths and
# the markdown texts shown in the UI.
# NOTE: the misspelt names (LEADERBORAD_*, DATA_TITILE_TYPE) are kept because
# other modules import them under these names.

# Columns that identify a model; always shown.
MODEL_INFO = ["Model Type", "Model"]
# Choices of the model-size filter.
MODEL_SIZE = ["<10B", ">=10B", "-"]
LEADERBOARD_VERSION = ["Version1"]
# Score columns the user can toggle, in display order.
TASK_INFO = ["Overall", "Culture-Specific", "Culture-Agnostic", "Japanese Art", "Japanese Heritage", "Japanese History", "World History", "Art & Psychology", "Business", "Science", "Health & Medicine", "Tech & Engineering"]
# Overall, Culture-Specific, Culture-Agnostic, English Original, Japanese Art, Japanese Heritage, Japanese History, World History, Art & Psychology, Business, Science, Health & Medicine, Tech & Engineering
# Score columns selected by default.
AVG_INFO = ["Overall"]


# Display datatype of each entry of COLUMN_NAMES (same order, same length).
DATA_TITILE_TYPE = ["markdown", "markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number"]

CSV_RESULT_PATH = "./download_from_dataset/result.csv"
CSV_QUEUE_DIR = "./download_from_dataset/queue"
COLUMN_NAMES = MODEL_INFO + TASK_INFO

LEADERBORAD_VERSION = ["JMMMU"]


LEADERBORAD_INTRODUCTION = """
# JMMMU Leaderboard

[🌐 **Homepage**](https://mmmu-japanese-benchmark.github.io/JMMMU/) | [🤗 **Dataset**](https://huggingface.co/datasets/JMMMU/JMMMU/) | [🏆 **HF Leaderboard**](https://huggingface.co/spaces/JMMMU/JMMMU_Leaderboard) | [📖 **arXiv (coming soon)**]() | [**GitHub**](https://github.com/EvolvingLMMs-Lab/lmms-eval)


### *"Which LMM is expert in Japanese subjects?"* 🏆 Welcome to the leaderboard of JMMMU
<div style="display: flex; flex-wrap: wrap; align-items: center; gap: 10px;">
</div>

We introduce **JMMMU** (***Japanese MMMU***), a multimodal benchmark that can truly evaluate LMM performance in Japanese.
JMMMU consists of **720 translation-based (Culture Agnostic)** and **600 brand-new questions (Culture Specific)**, for a **total of 1,320 questions**, updating the size of the existing culture-aware Japanese benchmark by >10x.

"""


SUBMIT_INTRODUCTION = """# Submit on JMMMU Benchmark Introduction
1. Obtain Result JSON File from [lmms-eval code base](https://github.com/EvolvingLMMs-Lab/lmms-eval).
2. If you want to update existing model performance by uploading new results, please ensure 'Revision Model Name' is the same as what's shown in the leaderboard. For example, if you want to modify LLaVA-OV 7B's performance, you need to fill in 'LLaVA-OV 7B' in 'Revision Model Name'.
3. Please provide the correct link of your model's repository for each submission.
4. After clicking 'Submit Eval', you can click 'Refresh' to obtain the latest result in the leaderboard.

Note: The example of the submitted JSON file is this url: [result.json](https://drive.google.com/file/d/10CF1c24BhoK9OM8De-2gLXDDnNcnMWvy/view?usp=sharing).

## Submit Example
If you want to upload LLaVA-OV 7B's result in the leaderboard, you need to:
1. Select LMM in 'Model Type'.
2. Fill in 'LLaVA-OV 7B' in 'Model Name' if it is your first time to submit your result (You can leave 'Revision Model Name' blank).
3. Fill in 'LLaVA-OV 7B' in 'Revision Model Name' if you want to update your result (You can leave 'Model Name' blank).
4. Fill in 'https://huggingface.co/lmms-lab/llava-onevision-qwen2-7b-ov' in 'Model Link'.
5. Fill in '7B' in 'Model size'.
6. Upload results.json.
7. Click the 'Submit Eval' button.
8. Click 'Refresh' to obtain the uploaded leaderboard.

To check whether the submission is successful, you can click the 'Logs' button. If a message of the form 'Success! Your {model name} has been added!' appears, the submission is successful.

### If you have any questions or deletion requests, please contact [miyai@cvm.t.u-tokyo.ac.jp](mailto:miyai@cvm.t.u-tokyo.ac.jp).
### ⚠️ Please do not submit any malicious file (e.g., files you manually edited).
"""


CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""@misc{onohara2024jmmmu,
    title={JMMMU: A Japanese Massive Multi-discipline Multimodal Understanding Benchmark},
    author={Shota Onohara and Atsuyuki Miyai and Yuki Imajuku and Kazuki Egashira and Jeonghun Baek and Xiang Yue and Graham Neubig and Kiyoharu Aizawa},
    url={https://huggingface.co/datasets/JMMMU/JMMMU},
    year={2024}
}"""
|
download_from_dataset
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
Subproject commit fd8fab23dd3d21d86514740c10690fbe6f30ae31
|
pyproject.toml
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[tool.ruff]
|
2 |
+
# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
|
3 |
+
select = ["E", "F"]
|
4 |
+
ignore = ["E501"] # line too long (black is taking care of this)
|
5 |
+
line-length = 119
|
6 |
+
fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
|
7 |
+
|
8 |
+
[tool.isort]
|
9 |
+
profile = "black"
|
10 |
+
line_length = 119
|
11 |
+
|
12 |
+
[tool.black]
|
13 |
+
line-length = 119
|
requirements.txt
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
APScheduler==3.10.1
|
2 |
+
black==23.11.0
|
3 |
+
click==8.1.3
|
4 |
+
datasets==2.14.5
|
5 |
+
gradio==4.4.0
|
6 |
+
gradio_client==0.7.0
|
7 |
+
huggingface-hub>=0.23.2
|
8 |
+
matplotlib==3.7.1
|
9 |
+
numpy==1.24.2
|
10 |
+
pandas==2.0.0
|
11 |
+
python-dateutil==2.8.2
|
12 |
+
requests==2.28.2
|
13 |
+
tqdm==4.65.0
|
14 |
+
transformers==4.35.2
|
15 |
+
tokenizers>=0.15.0
|
16 |
+
git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
|
17 |
+
accelerate==0.24.1
|
18 |
+
sentencepiece
|
19 |
+
openpyxl
|