eduagarcia committed on
Commit e9177b9
1 Parent(s): 7bdbf7b

Fix bugs, fix datasets path, added test functions

Files changed (2)
  1. functions.py +24 -19
  2. openllm.py +7 -1
functions.py CHANGED
@@ -7,6 +7,7 @@ from pytablewriter import MarkdownTableWriter
 import gradio as gr
 from openllm import get_json_format_data, get_datas
 import pandas as pd
+import traceback
 
 BOT_HF_TOKEN = os.getenv('BOT_HF_TOKEN')
 
@@ -23,7 +24,7 @@ If you encounter any issues, please report them to https://huggingface.co/spaces
 """
 
 def search(df, value):
-    result_df = df[df["Model"] == value]
+    result_df = df[df["Model Name"] == value]
     return result_df.iloc[0].to_dict() if not result_df.empty else None
 
 
@@ -39,8 +40,8 @@ def get_query_url(repo):
 def get_task_summary(results):
     return {
         "ENEM":
-            {"dataset_type":"enem_challenge",
-            "dataset_name":"ENEM Challenge",
+            {"dataset_type":"eduagarcia/enem_challenge",
+            "dataset_name":"ENEM Challenge (No Images)",
             "metric_type":"acc",
             "metric_value":results["ENEM"],
             "dataset_config": None,
@@ -50,8 +51,8 @@ def get_task_summary(results):
             "metric_name":"accuracy"
             },
         "BLUEX":
-            {"dataset_type":"bluex",
-            "dataset_name":"BLUEX",
+            {"dataset_type":"eduagarcia-temp/BLUEX_without_images",
+            "dataset_name":"BLUEX (No Images)",
             "metric_type":"acc",
             "metric_value":results["BLUEX"],
             "dataset_config": None,
@@ -61,7 +62,7 @@ def get_task_summary(results):
             "metric_name":"accuracy"
             },
         "OAB Exams":
-            {"dataset_type":"oab_exams",
+            {"dataset_type":"eduagarcia/oab_exams",
             "dataset_name":"OAB Exams",
             "metric_type":"acc",
             "metric_value":results["OAB Exams"],
@@ -72,8 +73,8 @@ def get_task_summary(results):
             "metric_name":"accuracy"
             },
         "ASSIN2 RTE":
-            {"dataset_type":"assin2_rte",
-            "dataset_name":"ASSIN2 RTE",
+            {"dataset_type":"assin2",
+            "dataset_name":"Assin2 RTE",
             "metric_type":"f1_macro",
             "metric_value":results["ASSIN2 RTE"],
             "dataset_config": None,
@@ -83,8 +84,8 @@ def get_task_summary(results):
             "metric_name":"f1-macro"
             },
         "ASSIN2 STS":
-            {"dataset_type":"assin2_sts",
-            "dataset_name":"ASSIN2 STS",
+            {"dataset_type":"assin2",
+            "dataset_name":"Assin2 STS",
             "metric_type":"pearson",
             "metric_value":results["ASSIN2 STS"],
             "dataset_config": None,
@@ -94,8 +95,8 @@ def get_task_summary(results):
             "metric_name":"pearson"
             },
         "FAQUAD NLI":
-            {"dataset_type":"fquad_nli",
-            "dataset_name":"FAQUAD NLI",
+            {"dataset_type":"ruanchaves/faquad-nli",
+            "dataset_name":"FaQuAD NLI",
             "metric_type":"f1_macro",
             "metric_value":results["FAQUAD NLI"],
             "dataset_config": None,
@@ -105,8 +106,8 @@ def get_task_summary(results):
             "metric_name":"f1-macro"
             },
         "HateBR":
-            {"dataset_type":"hatebr_offensive",
-            "dataset_name":"HateBR",
+            {"dataset_type":"eduagarcia/portuguese_benchmark",
+            "dataset_name":"HateBR Binary",
             "metric_type":"f1_macro",
             "metric_value":results["HateBR"],
             "dataset_config": None,
@@ -116,8 +117,8 @@ def get_task_summary(results):
             "metric_name":"f1-macro"
             },
         "PT Hate Speech":
-            {"dataset_type":"portuguese_hate_speech",
-            "dataset_name":"PT Hate Speech",
+            {"dataset_type":"eduagarcia/portuguese_benchmark",
+            "dataset_name":"PT Hate Speech Binary",
             "metric_type":"f1_macro",
             "metric_value":results["PT Hate Speech"],
             "dataset_config": None,
@@ -127,7 +128,7 @@ def get_task_summary(results):
             "metric_name":"f1-macro"
             },
         "tweetSentBR":
-            {"dataset_type":"tweetsentbr",
+            {"dataset_type":"eduagarcia-temp/tweetsentbr",
             "dataset_name":"tweetSentBR",
             "metric_type":"f1_macro",
             "metric_value":results["tweetSentBR"],
@@ -146,7 +147,7 @@ def get_eval_results(repo):
     task_summary = get_task_summary(results)
     md_writer = MarkdownTableWriter()
     md_writer.headers = ["Metric", "Value"]
-    md_writer.value_matrix = [["Avg.", results['Average ⬆️']]] + [[v["dataset_name"], v["metric_value"]] for v in task_summary.values()]
+    md_writer.value_matrix = [["Average", f"**{results['Average ⬆️']}**"]] + [[v["dataset_name"], v["metric_value"]] for v in task_summary.values()]
 
     text = f"""
 # [Open Portuguese LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard)
@@ -201,6 +202,7 @@ def commit(repo, pr_number=None, message="Adding Evaluation Results", oauth_toke
         if "Repo card metadata block was not found." in str(e): # There is no readme
             readme_text = get_edited_yaml_readme(repo, token=token)
         else:
+            traceback.print_exc()
            print(f"Something went wrong: {e}")
 
     liste = [CommitOperationAdd(path_in_repo="README.md", path_or_fileobj=readme_text.encode())]
@@ -217,4 +219,7 @@ def commit(repo, pr_number=None, message="Adding Evaluation Results", oauth_toke
         elif "Repository Not Found" in str(e):
             return "Repository Not Found"
         else:
-            return e
+            return e
+
+if __name__ == "__main__":
+    print(get_eval_results("Qwen/Qwen1.5-72B-Chat"))
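For reference, a minimal sketch of how the two fixes in functions.py are meant to behave: search() now keys on the leaderboard's "Model Name" column, and the generated Markdown table labels and bolds the average score. The DataFrame rows and metric values below are made-up placeholders, not real leaderboard output.

# Sketch only: toy leaderboard rows, not real data.
import pandas as pd
from pytablewriter import MarkdownTableWriter

def search(df, value):
    result_df = df[df["Model Name"] == value]
    return result_df.iloc[0].to_dict() if not result_df.empty else None

df = pd.DataFrame({
    "Model Name": ["org/model-a", "org/model-b"],   # hypothetical models
    "Average ⬆️": [61.2, 58.7],
    "ENEM": [63.4, 60.1],
})

results = search(df, "org/model-a")   # -> {'Model Name': 'org/model-a', 'Average ⬆️': 61.2, 'ENEM': 63.4}

md_writer = MarkdownTableWriter()
md_writer.headers = ["Metric", "Value"]
md_writer.value_matrix = [["Average", f"**{results['Average ⬆️']}**"],
                          ["ENEM Challenge (No Images)", results["ENEM"]]]
print(md_writer.dumps())   # Markdown table that goes into the generated model card text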
openllm.py CHANGED
@@ -41,4 +41,10 @@ def get_datas(data):
         except (KeyError, TypeError):
             continue
 
-    return result_list
+    return result_list
+
+if __name__ == "__main__":
+    data = get_json_format_data()
+    print(data)
+    finished_models = get_datas(data)
+    print(finished_models)
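The new __main__ blocks act as quick smoke tests: running python openllm.py should print the raw leaderboard payload and the parsed entries, and python functions.py should print the generated card text for Qwen/Qwen1.5-72B-Chat. Below is a hedged sketch of the filtering pattern get_datas relies on (its body is not part of this diff), with invented field names used purely for illustration.

# Sketch under assumptions: the field names ("model", "results", "average")
# are illustrative only; get_datas' actual keys are not shown in this commit.
def collect_finished(data):
    result_list = []
    for entry in data:
        try:
            result_list.append({"model": entry["model"],
                                "average": entry["results"]["average"]})
        except (KeyError, TypeError):
            continue  # same guard as get_datas: skip incomplete entries
    return result_list

if __name__ == "__main__":
    sample = [
        {"model": "org/model-a", "results": {"average": 61.2}},
        {"model": "org/model-b", "results": None},  # TypeError -> skipped
        {"model": "org/model-c"},                   # KeyError -> skipped
    ]
    print(collect_finished(sample))  # only org/model-a survives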