eduagarcia committed
Commit 9b95b87 • Parent(s): b787f43

Adapt code to work with the Open Portuguese LLM leaderboard

Files changed:
- app.py +9 -7
- functions.py +98 -67
- openllm.py +44 -0
app.py CHANGED

@@ -1,6 +1,5 @@
 import os
 import time
-os.system("wget https://raw.githubusercontent.com/Weyaxi/scrape-open-llm-leaderboard/main/openllm.py")
 from huggingface_hub import HfApi, HfFileSystem
 import time
 import pandas as pd
@@ -21,17 +20,20 @@ fs = HfFileSystem()
 def refresh(how_much=3600): # default to 1 hour
     time.sleep(how_much)
     try:
-        api.restart_space(repo_id="…
+        api.restart_space(repo_id="eduagarcia-temp/portuguese-leaderboard-results-to-modelcard")
     except Exception as e:
         print(f"Error while scraping leaderboard, trying again... {e}")
         refresh(600) # 10 minutes if any error happens

-gradio_title="🧐 Open LLM Leaderboard Results PR Opener"
-gradio_desc= """
+gradio_title="🧐 Open Portuguese LLM Leaderboard Results PR Opener"
+gradio_desc= """
+This is a fork of the [🧐 Open LLM Leaderboard Results PR Opener](https://huggingface.co/spaces/Weyaxi/leaderboard-results-to-modelcard) from [@Weyaxi](https://huggingface.co/Weyaxi), modified to work with the [Open Portuguese LLM Leaderboard](https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard).
+🎯 This tool's aim is to provide [Open Portuguese LLM Leaderboard](https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard) results in the model card.

 ## 💭 What Does This Tool Do:

-- This tool adds the [Open LLM Leaderboard](https://huggingface.co/spaces/…
+- This tool adds the [Open Portuguese LLM Leaderboard](https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard) result of your model at the end of your model card.

 - This tool also adds evaluation results as your model's metadata to showcase the evaluation results as a widget.

@@ -41,9 +43,9 @@ The leaderboard's backend mainly runs on the [Hugging Face Hub API](https://hugg…

 ## 🤝 Acknowledgements

-…
-- Special thanks to [Lucain Pouget (Wauplin)](https://huggingface.co/Wauplin) for assisting with the [Hugging Face Hub API](https://huggingface.co/docs/huggingface_hub/v0.5.1/en/package_reference/hf_api).
+- Thanks to [Yağız Çalık (Weyaxi)](https://huggingface.co/Weyaxi) for creating the original [🧐 Open LLM Leaderboard Results PR Opener](https://huggingface.co/spaces/Weyaxi/leaderboard-results-to-modelcard) tool.
 """

 with gr.Blocks() as demo:
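Note on the refresh() change above: the hunk shows only the function body and the opening of `with gr.Blocks() as demo:`, not how the loop is actually started. A minimal sketch of the usual wiring, assuming the refresher runs on a daemon thread kicked off before `demo.launch()`; the threading setup and the placeholder UI below are assumptions, not code from this commit.

import threading
import time

import gradio as gr
from huggingface_hub import HfApi

api = HfApi()

def refresh(how_much=3600):  # default to 1 hour
    time.sleep(how_much)
    try:
        # Restarting the Space re-runs the scraping step at startup, refreshing the data.
        api.restart_space(repo_id="eduagarcia-temp/portuguese-leaderboard-results-to-modelcard")
    except Exception as e:
        print(f"Error while scraping leaderboard, trying again... {e}")
        refresh(600)  # retry in 10 minutes if any error happens

with gr.Blocks() as demo:
    gr.Markdown("…")  # placeholder; the real UI is defined in app.py

# Hypothetical wiring: run the refresher off the main thread so it doesn't block the app.
threading.Thread(target=refresh, daemon=True).start()
demo.launch()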
functions.py CHANGED

@@ -15,11 +15,11 @@ finished_models = get_datas(data)
 df = pd.DataFrame(finished_models)

 desc = """
-This is an automated PR created with https://huggingface.co/spaces/…
+This is an automated PR created with https://huggingface.co/spaces/eduagarcia-temp/portuguese-leaderboard-results-to-modelcard

-The purpose of this PR is to add evaluation results from the Open LLM Leaderboard to your model card.
+The purpose of this PR is to add evaluation results from the Open Portuguese LLM Leaderboard to your model card.

-If you encounter any issues, please report them to https://huggingface.co/spaces/…
+If you encounter any issues, please report them to https://huggingface.co/spaces/eduagarcia-temp/portuguese-leaderboard-results-to-modelcard/discussions
 """

 def search(df, value):
@@ -28,84 +28,115 @@ def search(df, value):


 def get_details_url(repo):
-    author, model = repo.split("/")
-    return f"https://huggingface.co/datasets/…
+    #author, model = repo.split("/")
+    return f"https://huggingface.co/datasets/eduagarcia-temp/llm_pt_leaderboard_raw_results/tree/main/{repo}"


 def get_query_url(repo):
-    return f"https://huggingface.co/spaces/…
+    return f"https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard?query={repo}"


 def get_task_summary(results):
     return {
-        … (the removed Open LLM Leaderboard entries are mostly truncated in the diff view; only the fragments below are recoverable) …
-             "dataset_name":"MMLU (5-Shot)",
-             "metric_type":"acc",
-        …
-             "dataset_config":"winogrande_xl",
-             "dataset_split":"validation",
-             "dataset_args":{"num_few_shot": 5},
-             "metric_name":"accuracy"
-        },
-        "GSM8K":
-            {
-             "dataset_type":"gsm8k",
-             "dataset_name":"GSM8k (5-shot)",
-             "metric_type":"acc",
-             "metric_value":results["GSM8K"],
-             "dataset_config":"main",
-             "dataset_split":"test",
-        …
+        "ENEM":
+            {"dataset_type":"enem_challenge",
+             "dataset_name":"ENEM Challenge",
+             "metric_type":"acc",
+             "metric_value":results["ENEM"],
+             "dataset_config": None,
+             "dataset_split":"train",
+             "dataset_revision":None,
+             "dataset_args":{"num_few_shot": 3},
+             "metric_name":"accuracy"
+        },
+        "BLUEX":
+            {"dataset_type":"bluex",
+             "dataset_name":"BLUEX",
+             "metric_type":"acc",
+             "metric_value":results["BLUEX"],
+             "dataset_config": None,
+             "dataset_split":"train",
+             "dataset_revision":None,
+             "dataset_args":{"num_few_shot": 3},
+             "metric_name":"accuracy"
+        },
+        "OAB Exams":
+            {"dataset_type":"oab_exams",
+             "dataset_name":"OAB Exams",
+             "metric_type":"acc",
+             "metric_value":results["OAB Exams"],
+             "dataset_config": None,
+             "dataset_split":"train",
+             "dataset_revision":None,
+             "dataset_args":{"num_few_shot": 3},
+             "metric_name":"accuracy"
+        },
+        "ASSIN2 RTE":
+            {"dataset_type":"assin2_rte",
+             "dataset_name":"ASSIN2 RTE",
+             "metric_type":"f1_macro",
+             "metric_value":results["ASSIN2 RTE"],
+             "dataset_config": None,
+             "dataset_split":"test",
+             "dataset_revision":None,
+             "dataset_args":{"num_few_shot": 15},
+             "metric_name":"f1-macro"
+        },
+        "ASSIN2 STS":
+            {"dataset_type":"assin2_sts",
+             "dataset_name":"ASSIN2 STS",
+             "metric_type":"pearson",
+             "metric_value":results["ASSIN2 STS"],
+             "dataset_config": None,
+             "dataset_split":"test",
+             "dataset_revision":None,
+             "dataset_args":{"num_few_shot": 15},
+             "metric_name":"pearson"
+        },
+        "FAQUAD NLI":
+            {"dataset_type":"fquad_nli",
+             "dataset_name":"FAQUAD NLI",
+             "metric_type":"f1_macro",
+             "metric_value":results["FAQUAD NLI"],
+             "dataset_config": None,
+             "dataset_split":"test",
+             "dataset_revision":None,
+             "dataset_args":{"num_few_shot": 15},
+             "metric_name":"f1-macro"
+        },
+        "HateBR":
+            {"dataset_type":"hatebr_offensive",
+             "dataset_name":"HateBR",
+             "metric_type":"f1_macro",
+             "metric_value":results["HateBR"],
+             "dataset_config": None,
+             "dataset_split":"test",
+             "dataset_revision":None,
+             "dataset_args":{"num_few_shot": 25},
+             "metric_name":"f1-macro"
+        },
+        "PT Hate Speech":
+            {"dataset_type":"portuguese_hate_speech",
+             "dataset_name":"PT Hate Speech",
+             "metric_type":"f1_macro",
+             "metric_value":results["PT Hate Speech"],
+             "dataset_config": None,
+             "dataset_split":"test",
+             "dataset_revision":None,
+             "dataset_args":{"num_few_shot": 25},
+             "metric_name":"f1-macro"
+        },
+        "tweetSentBR":
+            {"dataset_type":"tweetsentbr",
+             "dataset_name":"tweetSentBR",
+             "metric_type":"f1_macro",
+             "metric_value":results["tweetSentBR"],
+             "dataset_config": None,
+             "dataset_split":"test",
+             "dataset_revision":None,
+             "dataset_args":{"num_few_shot": 25},
+             "metric_name":"f1-macro"
+        }
 }
@@ -118,7 +149,7 @@ def get_eval_results(repo):
 md_writer.value_matrix = [["Avg.", results['Average ⬆️']]] + [[v["dataset_name"], v["metric_value"]] for v in task_summary.values()]

 text = f"""
-# [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/…
+# [Open Portuguese LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard)
 Detailed results can be found [here]({get_details_url(repo)})

 {md_writer.dumps()}
@@ -130,7 +161,7 @@ def get_edited_yaml_readme(repo, token: str | None):
 card = ModelCard.load(repo, token=token)
 results = search(df, repo)

-common = {"task_type": 'text-generation', "task_name": 'Text Generation', "source_name": "Open LLM Leaderboard", "source_url": …
+common = {"task_type": 'text-generation', "task_name": 'Text Generation', "source_name": "Open Portuguese LLM Leaderboard", "source_url": get_query_url(repo)}

 tasks_results = get_task_summary(results)
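For context on `common` and `get_task_summary` above: the code that merges them into the model card's metadata is outside this diff, but each task entry plausibly expands to one record in the Hub's standard `model-index` eval-results format, which is what renders the widget mentioned in app.py's description. A sketch for the "ENEM" entry with a placeholder metric value; the exact construction inside `get_edited_yaml_readme` is assumed, not shown in this commit.

# One model-index record, as the "ENEM" task summary plus `common` plausibly
# expands to. Field names follow the Hub's eval-results metadata spec; the
# metric value stands in for results["ENEM"].
enem_record = {
    "task": {"type": "text-generation", "name": "Text Generation"},
    "dataset": {
        "type": "enem_challenge",
        "name": "ENEM Challenge",
        "config": None,
        "split": "train",
        "revision": None,
        "args": {"num_few_shot": 3},
    },
    "metrics": [{"type": "acc", "value": 65.0, "name": "accuracy"}],
    "source": {
        "name": "Open Portuguese LLM Leaderboard",
        "url": "https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard?query=author/model",
    },
}

metadata = {
    "model-index": [{
        "name": "author/model",       # hypothetical repo id
        "results": [enem_record],     # one record per entry in get_task_summary()
    }]
}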
openllm.py ADDED

@@ -0,0 +1,44 @@
+import requests
+from bs4 import BeautifulSoup
+import pandas as pd
+import json
+
+
+def get_json_format_data():
+    url = 'https://eduagarcia-open-pt-llm-leaderboard.hf.space/'
+    response = requests.get(url)
+    soup = BeautifulSoup(response.content, 'html.parser')
+
+    script_elements = soup.find_all('script')
+    json_format_data = json.loads(str(script_elements[1])[31:-10])
+    return json_format_data
+
+
+def get_datas(data):
+    for component_index in range(10, 50, 1):  # component_index sometimes changes when they update the space; this loop avoids changing it manually
+        try:
+            result_list = []
+            i = 0
+            while True:
+                try:
+                    results = data['components'][component_index]['props']['value']['data'][i]
+                    columns = data['components'][component_index]['props']['headers']
+                    try:
+                        results_json = {"T": results[0], "Model": results[-1]}
+
+                        if len(columns) < 15:  # If there are fewer than 15 columns (this number can definitely change), we are at the wrong component index, so break to try the next one.
+                            break
+
+                        for col_index, col_name in enumerate(columns[2:-1], start=2):
+                            results_json[col_name] = results[col_index]
+
+                    except IndexError:  # Wrong component index, so break to try the next one. (NOTE: more than one component index can return some results, but we must find the right one to get all the results we want.)
+                        break
+                    result_list.append(results_json)
+                    i += 1
+                except IndexError:  # No more rows to extract, so return the list. (We know it is the right component index because we didn't break out on the other exception.)
+                    return result_list
+        except (KeyError, TypeError):
+            continue
+
+    return result_list
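How functions.py consumes this new scraper: its first lines are visible in the hunk header earlier in this commit (`finished_models = get_datas(data)` feeding `df = pd.DataFrame(finished_models)`). A minimal end-to-end sketch; the import line is an assumption, since it isn't shown in the diff.

import pandas as pd
from openllm import get_json_format_data, get_datas  # assumed import; not shown in the diff

data = get_json_format_data()       # Gradio config JSON scraped from the Space's HTML
finished_models = get_datas(data)   # one dict per leaderboard row: {"T": ..., "Model": ..., <metric columns>}
df = pd.DataFrame(finished_models)  # later searched per-repo by search(df, repo)
print(df.head())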