Weyaxi committed
Commit b787f43
1 parent: 2ccf9b0

Multi-file test (#9)


- Multi-file test (1436ea6ac7b4526f3ac1beed6281897dcc29c2bb)

Files changed (2)
  1. app.py +3 -195
  2. functions.py +189 -0
app.py CHANGED
@@ -1,19 +1,13 @@
 import os
 import time
 os.system("wget https://raw.githubusercontent.com/Weyaxi/scrape-open-llm-leaderboard/main/openllm.py")
-from huggingface_hub import CommitOperationAdd, create_commit, HfApi, HfFileSystem, RepoUrl
-from huggingface_hub import ModelCardData, EvalResult, ModelCard
-from huggingface_hub.repocard_data import eval_results_to_model_index
-from huggingface_hub.repocard import RepoCard
-from openllm import get_json_format_data, get_datas
-from tqdm import tqdm
+from huggingface_hub import HfApi, HfFileSystem
 import time
-import requests
 import pandas as pd
-from pytablewriter import MarkdownTableWriter
 import threading
 import gradio as gr
 from gradio_space_ci import enable_space_ci
+from functions import commit

 enable_space_ci()

@@ -24,200 +18,14 @@ BOT_HF_TOKEN = os.getenv('BOT_HF_TOKEN')
 api = HfApi()
 fs = HfFileSystem()

-data = get_json_format_data()
-finished_models = get_datas(data)
-df = pd.DataFrame(finished_models)
-
-
 def refresh(how_much=3600): # default to 1 hour
-    global data, finished_models, df
     time.sleep(how_much)
-
     try:
-        data = get_json_format_data()
-        finished_models = get_datas(data)
-        df = pd.DataFrame(finished_models)
+        api.restart_space(repo_id="Weyaxi/leaderboard-results-to-modelcard")
     except Exception as e:
         print(f"Error while scraping leaderboard, trying again... {e}")
         refresh(600) # 10 minutes if any error happens

-
-def search(df, value):
-    result_df = df[df["Model"] == value]
-    return result_df.iloc[0].to_dict() if not result_df.empty else None
-
-
-def get_details_url(repo):
-    author, model = repo.split("/")
-    return f"https://huggingface.co/datasets/open-llm-leaderboard/details_{author}__{model}"
-
-
-def get_query_url(repo):
-    return f"https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query={repo}"
-
-
-desc = """
-This is an automated PR created with https://huggingface.co/spaces/Weyaxi/open-llm-leaderboard-results-pr
-
-The purpose of this PR is to add evaluation results from the Open LLM Leaderboard to your model card.
-
-If you encounter any issues, please report them to https://huggingface.co/spaces/Weyaxi/open-llm-leaderboard-results-pr/discussions
-"""
-
-
-def get_task_summary(results):
-    return {
-        "ARC":
-            {"dataset_type":"ai2_arc",
-             "dataset_name":"AI2 Reasoning Challenge (25-Shot)",
-             "metric_type":"acc_norm",
-             "metric_value":results["ARC"],
-             "dataset_config":"ARC-Challenge",
-             "dataset_split":"test",
-             "dataset_revision":None,
-             "dataset_args":{"num_few_shot": 25},
-             "metric_name":"normalized accuracy"
-             },
-        "HellaSwag":
-            {"dataset_type":"hellaswag",
-             "dataset_name":"HellaSwag (10-Shot)",
-             "metric_type":"acc_norm",
-             "metric_value":results["HellaSwag"],
-             "dataset_config":None,
-             "dataset_split":"validation",
-             "dataset_revision":None,
-             "dataset_args":{"num_few_shot": 10},
-             "metric_name":"normalized accuracy"
-             },
-        "MMLU":
-            {
-             "dataset_type":"cais/mmlu",
-             "dataset_name":"MMLU (5-Shot)",
-             "metric_type":"acc",
-             "metric_value":results["MMLU"],
-             "dataset_config":"all",
-             "dataset_split":"test",
-             "dataset_revision":None,
-             "dataset_args":{"num_few_shot": 5},
-             "metric_name":"accuracy"
-             },
-        "TruthfulQA":
-            {
-             "dataset_type":"truthful_qa",
-             "dataset_name":"TruthfulQA (0-shot)",
-             "metric_type":"mc2",
-             "metric_value":results["TruthfulQA"],
-             "dataset_config":"multiple_choice",
-             "dataset_split":"validation",
-             "dataset_revision":None,
-             "dataset_args":{"num_few_shot": 0},
-             "metric_name":None
-             },
-        "Winogrande":
-            {
-             "dataset_type":"winogrande",
-             "dataset_name":"Winogrande (5-shot)",
-             "metric_type":"acc",
-             "metric_value":results["Winogrande"],
-             "dataset_config":"winogrande_xl",
-             "dataset_split":"validation",
-             "dataset_args":{"num_few_shot": 5},
-             "metric_name":"accuracy"
-             },
-        "GSM8K":
-            {
-             "dataset_type":"gsm8k",
-             "dataset_name":"GSM8k (5-shot)",
-             "metric_type":"acc",
-             "metric_value":results["GSM8K"],
-             "dataset_config":"main",
-             "dataset_split":"test",
-             "dataset_args":{"num_few_shot": 5},
-             "metric_name":"accuracy"
-             }
-    }
-
-
-
-def get_eval_results(repo):
-    results = search(df, repo)
-    task_summary = get_task_summary(results)
-    md_writer = MarkdownTableWriter()
-    md_writer.headers = ["Metric", "Value"]
-    md_writer.value_matrix = [["Avg.", results['Average ⬆️']]] + [[v["dataset_name"], v["metric_value"]] for v in task_summary.values()]
-
-    text = f"""
-# [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)
-Detailed results can be found [here]({get_details_url(repo)})
-
-{md_writer.dumps()}
-"""
-    return text
-
-
-def get_edited_yaml_readme(repo, token: str | None):
-    card = ModelCard.load(repo, token=token)
-    results = search(df, repo)
-
-    common = {"task_type": 'text-generation', "task_name": 'Text Generation', "source_name": "Open LLM Leaderboard", "source_url": f"https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query={repo}"}
-
-    tasks_results = get_task_summary(results)
-
-    if not card.data['eval_results']: # No results reported yet, we initialize the metadata
-        card.data["model-index"] = eval_results_to_model_index(repo.split('/')[1], [EvalResult(**task, **common) for task in tasks_results.values()])
-    else: # We add the new evaluations
-        for task in tasks_results.values():
-            cur_result = EvalResult(**task, **common)
-            if any(result.is_equal_except_value(cur_result) for result in card.data['eval_results']):
-                continue
-            card.data['eval_results'].append(cur_result)
-
-    return str(card)
-
-
-def commit(repo, pr_number=None, message="Adding Evaluation Results", oauth_token: gr.OAuthToken | None = None): # specify pr number if you want to edit it, don't if you don't want
-    if oauth_token is None:
-        gr.Warning("You are not logged in; therefore, the leaderboard-pr-bot will open the pull request instead of you. Click on 'Sign in with Huggingface' to log in.")
-        token = BOT_HF_TOKEN
-    elif oauth_token.expires_at < time.time():
-        raise gr.Error("Token expired. Logout and try again.")
-    else:
-        token = oauth_token.token
-
-    if repo.startswith("https://huggingface.co/"):
-        try:
-            repo = RepoUrl(repo).repo_id
-        except Exception:
-            raise gr.Error(f"Not a valid repo id: {str(repo)}")
-
-    edited = {"revision": f"refs/pr/{pr_number}"} if pr_number else {"create_pr": True}
-
-    try:
-        try: # check if there is a readme already
-            readme_text = get_edited_yaml_readme(repo, token=token) + get_eval_results(repo)
-        except Exception as e:
-            if "Repo card metadata block was not found." in str(e): # There is no readme
-                readme_text = get_edited_yaml_readme(repo, token=token)
-            else:
-                print(f"Something went wrong: {e}")
-
-        liste = [CommitOperationAdd(path_in_repo="README.md", path_or_fileobj=readme_text.encode())]
-        commit = (create_commit(repo_id=repo, token=token, operations=liste, commit_message=message, commit_description=desc, repo_type="model", **edited).pr_url)
-
-        return commit
-
-    except Exception as e:
-
-        if "Discussions are disabled for this repo" in str(e):
-            return "Discussions disabled"
-        elif "Cannot access gated repo" in str(e):
-            return "Gated repo"
-        elif "Repository Not Found" in str(e):
-            return "Repository Not Found"
-        else:
-            return e
-
-
 gradio_title="🧐 Open LLM Leaderboard Results PR Opener"
 gradio_desc= """🎯 This tool's aim is to provide [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) results in the model card.
 
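The remainder of app.py is unchanged and not shown in this diff. As a minimal sketch of how the refresh() loop above is typically driven, assuming the surviving `import threading` exists for exactly this purpose (the thread start itself is not part of this commit):

# Sketch only, not part of this commit: run the hourly restart loop
# in the background so it does not block the Gradio app.
import threading
threading.Thread(target=refresh, daemon=True).start()
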
functions.py ADDED
@@ -0,0 +1,189 @@
+import os
+from huggingface_hub import CommitOperationAdd, create_commit, RepoUrl
+from huggingface_hub import EvalResult, ModelCard
+from huggingface_hub.repocard_data import eval_results_to_model_index
+import time
+from pytablewriter import MarkdownTableWriter
+import gradio as gr
+from openllm import get_json_format_data, get_datas
+import pandas as pd
+
+BOT_HF_TOKEN = os.getenv('BOT_HF_TOKEN')
+
+data = get_json_format_data()
+finished_models = get_datas(data)
+df = pd.DataFrame(finished_models)
+
+desc = """
+This is an automated PR created with https://huggingface.co/spaces/Weyaxi/open-llm-leaderboard-results-pr
+
+The purpose of this PR is to add evaluation results from the Open LLM Leaderboard to your model card.
+
+If you encounter any issues, please report them to https://huggingface.co/spaces/Weyaxi/open-llm-leaderboard-results-pr/discussions
+"""
+
+def search(df, value):
+    result_df = df[df["Model"] == value]
+    return result_df.iloc[0].to_dict() if not result_df.empty else None
+
+
+def get_details_url(repo):
+    author, model = repo.split("/")
+    return f"https://huggingface.co/datasets/open-llm-leaderboard/details_{author}__{model}"
+
+
+def get_query_url(repo):
+    return f"https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query={repo}"
+
+
+def get_task_summary(results):
+    return {
+        "ARC":
+            {"dataset_type":"ai2_arc",
+             "dataset_name":"AI2 Reasoning Challenge (25-Shot)",
+             "metric_type":"acc_norm",
+             "metric_value":results["ARC"],
+             "dataset_config":"ARC-Challenge",
+             "dataset_split":"test",
+             "dataset_revision":None,
+             "dataset_args":{"num_few_shot": 25},
+             "metric_name":"normalized accuracy"
+             },
+        "HellaSwag":
+            {"dataset_type":"hellaswag",
+             "dataset_name":"HellaSwag (10-Shot)",
+             "metric_type":"acc_norm",
+             "metric_value":results["HellaSwag"],
+             "dataset_config":None,
+             "dataset_split":"validation",
+             "dataset_revision":None,
+             "dataset_args":{"num_few_shot": 10},
+             "metric_name":"normalized accuracy"
+             },
+        "MMLU":
+            {
+             "dataset_type":"cais/mmlu",
+             "dataset_name":"MMLU (5-Shot)",
+             "metric_type":"acc",
+             "metric_value":results["MMLU"],
+             "dataset_config":"all",
+             "dataset_split":"test",
+             "dataset_revision":None,
+             "dataset_args":{"num_few_shot": 5},
+             "metric_name":"accuracy"
+             },
+        "TruthfulQA":
+            {
+             "dataset_type":"truthful_qa",
+             "dataset_name":"TruthfulQA (0-shot)",
+             "metric_type":"mc2",
+             "metric_value":results["TruthfulQA"],
+             "dataset_config":"multiple_choice",
+             "dataset_split":"validation",
+             "dataset_revision":None,
+             "dataset_args":{"num_few_shot": 0},
+             "metric_name":None
+             },
+        "Winogrande":
+            {
+             "dataset_type":"winogrande",
+             "dataset_name":"Winogrande (5-shot)",
+             "metric_type":"acc",
+             "metric_value":results["Winogrande"],
+             "dataset_config":"winogrande_xl",
+             "dataset_split":"validation",
+             "dataset_args":{"num_few_shot": 5},
+             "metric_name":"accuracy"
+             },
+        "GSM8K":
+            {
+             "dataset_type":"gsm8k",
+             "dataset_name":"GSM8k (5-shot)",
+             "metric_type":"acc",
+             "metric_value":results["GSM8K"],
+             "dataset_config":"main",
+             "dataset_split":"test",
+             "dataset_args":{"num_few_shot": 5},
+             "metric_name":"accuracy"
+             }
+    }
+
+
+
+def get_eval_results(repo):
+    results = search(df, repo)
+    task_summary = get_task_summary(results)
+    md_writer = MarkdownTableWriter()
+    md_writer.headers = ["Metric", "Value"]
+    md_writer.value_matrix = [["Avg.", results['Average ⬆️']]] + [[v["dataset_name"], v["metric_value"]] for v in task_summary.values()]
+
+    text = f"""
+# [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)
+Detailed results can be found [here]({get_details_url(repo)})
+
+{md_writer.dumps()}
+"""
+    return text
+
+
+def get_edited_yaml_readme(repo, token: str | None):
+    card = ModelCard.load(repo, token=token)
+    results = search(df, repo)
+
+    common = {"task_type": 'text-generation', "task_name": 'Text Generation', "source_name": "Open LLM Leaderboard", "source_url": f"https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query={repo}"}
+
+    tasks_results = get_task_summary(results)
+
+    if not card.data['eval_results']: # No results reported yet, we initialize the metadata
+        card.data["model-index"] = eval_results_to_model_index(repo.split('/')[1], [EvalResult(**task, **common) for task in tasks_results.values()])
+    else: # We add the new evaluations
+        for task in tasks_results.values():
+            cur_result = EvalResult(**task, **common)
+            if any(result.is_equal_except_value(cur_result) for result in card.data['eval_results']):
+                continue
+            card.data['eval_results'].append(cur_result)
+
+    return str(card)
+
+
+def commit(repo, pr_number=None, message="Adding Evaluation Results", oauth_token: gr.OAuthToken | None = None): # specify pr number if you want to edit it, don't if you don't want
+    if oauth_token is None:
+        gr.Warning("You are not logged in; therefore, the leaderboard-pr-bot will open the pull request instead of you. Click on 'Sign in with Huggingface' to log in.")
+        token = BOT_HF_TOKEN
+    elif oauth_token.expires_at < time.time():
+        raise gr.Error("Token expired. Logout and try again.")
+    else:
+        token = oauth_token.token
+
+    if repo.startswith("https://huggingface.co/"):
+        try:
+            repo = RepoUrl(repo).repo_id
+        except Exception:
+            raise gr.Error(f"Not a valid repo id: {str(repo)}")
+
+    edited = {"revision": f"refs/pr/{pr_number}"} if pr_number else {"create_pr": True}
+
+    try:
+        try: # check if there is a readme already
+            readme_text = get_edited_yaml_readme(repo, token=token) + get_eval_results(repo)
+        except Exception as e:
+            if "Repo card metadata block was not found." in str(e): # There is no readme
+                readme_text = get_edited_yaml_readme(repo, token=token)
+            else:
+                print(f"Something went wrong: {e}")
+
+        liste = [CommitOperationAdd(path_in_repo="README.md", path_or_fileobj=readme_text.encode())]
+        commit = (create_commit(repo_id=repo, token=token, operations=liste, commit_message=message, commit_description=desc, repo_type="model", **edited).pr_url)
+
+        return commit
+
+    except Exception as e:
+
+        if "Discussions are disabled for this repo" in str(e):
+            return "Discussions disabled"
+        elif "Cannot access gated repo" in str(e):
+            return "Gated repo"
+        elif "Repository Not Found" in str(e):
+            return "Repository Not Found"
+        else:
+            return e
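
Because app.py now only does `from functions import commit`, the helper can also be exercised on its own. A minimal sketch, assuming a placeholder repo id and no Gradio OAuth token (in which case, as the code above shows, it falls back to BOT_HF_TOKEN):

# Hypothetical direct use of the refactored helper; "author/model" is a placeholder repo id.
from functions import commit

pr_url = commit("author/model")  # opens a PR adding leaderboard results to that model card
print(pr_url)                    # PR URL on success, or a short error string otherwise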