XufengDuan committed on
Commit
ad27ecb
·
1 Parent(s): 86c17df

update scripts

Browse files
app.py CHANGED
@@ -1,5 +1,4 @@
1
  import logging
2
-
3
  import gradio as gr
4
  import pandas as pd
5
  from apscheduler.schedulers.background import BackgroundScheduler
@@ -9,7 +8,6 @@ from main_backend import PENDING_STATUS, RUNNING_STATUS, FINISHED_STATUS, FAILED
9
  from src.backend import sort_queue
10
  from src.envs import EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, RESULTS_REPO
11
  import src.backend.manage_requests as manage_requests
12
-
13
  import socket
14
  import src.display.about as about
15
  from src.display.css_html_js import custom_css
@@ -21,12 +19,11 @@ import os
21
  import datetime
22
  import spacy_transformers
23
  import pprint
 
24
 
25
  pp = pprint.PrettyPrinter(width=80)
26
-
27
  TOKEN = os.environ.get("H4_TOKEN", None)
28
  print("TOKEN", TOKEN)
29
- import src.backend.run_eval_suite as run_eval_suite
30
 
31
  def ui_snapshot_download(repo_id, local_dir, repo_type, tqdm_class, etag_timeout):
32
  try:
@@ -45,7 +42,8 @@ def init_space():
45
  # sync model_type with open-llm-leaderboard
46
  ui_snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
47
  ui_snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
48
- raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, utils.COLS, utils.BENCHMARK_COLS)
 
49
 
50
  finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, utils.EVAL_COLS)
51
  return original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
 
1
  import logging
 
2
  import gradio as gr
3
  import pandas as pd
4
  from apscheduler.schedulers.background import BackgroundScheduler
 
8
  from src.backend import sort_queue
9
  from src.envs import EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, RESULTS_REPO
10
  import src.backend.manage_requests as manage_requests
 
11
  import socket
12
  import src.display.about as about
13
  from src.display.css_html_js import custom_css
 
19
  import datetime
20
  import spacy_transformers
21
  import pprint
22
+ import src.backend.run_eval_suite as run_eval_suite
23
 
24
  pp = pprint.PrettyPrinter(width=80)
 
25
  TOKEN = os.environ.get("H4_TOKEN", None)
26
  print("TOKEN", TOKEN)
 
27
 
28
  def ui_snapshot_download(repo_id, local_dir, repo_type, tqdm_class, etag_timeout):
29
  try:
 
42
  # sync model_type with open-llm-leaderboard
43
  ui_snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
44
  ui_snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
45
+
46
+ original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, utils.COLS, utils.BENCHMARK_COLS)
47
 
48
  finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, utils.EVAL_COLS)
49
  return original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
src/backend/model_operations.py CHANGED
@@ -2,7 +2,7 @@ import os
2
  import time
3
  from datetime import datetime
4
  import logging
5
- from pathlib import Path
6
  import requests
7
  import json
8
 
@@ -135,12 +135,12 @@ class SummaryGenerator:
135
  # prompt = {}
136
  # for index, row in tqdm(df_prompt.iterrows(), total=df_prompt.shape[0]):
137
  # prompt['E' + row['Item']] = row['Prompt']
138
- xls = pd.ExcelFile(dataset)
139
  sheet_names = xls.sheet_names
140
  # sheet_names = df.sheetnames
141
- print(f"Total: {len(sheet_names)}")
142
- print(sheet_names)
143
-
144
  Experiment_ID, Questions_ID, Item_ID, Condition, User_prompt, Response, Factor_2, Stimuli_1 = [], [], [], [], [] ,[], [], []
145
  exit_outer_loop = False # bad model
146
  for i, sheet_name in enumerate(sheet_names, start=1):
@@ -150,17 +150,17 @@ class SummaryGenerator:
150
  # if i > 2 and i ==1:
151
  # continue
152
  print(i, sheet_name)
153
- df_sheet = pd.read_excel(xls, sheet_name=sheet_name)
154
-
155
  # 假设第一列是'Prompt0',但这里我们使用列名来避免硬编码
156
- if 'Prompt0' in df_sheet.columns:
157
- prompt_column = df_sheet['Prompt0']
158
- else:
159
  # 如果'Prompt0'列不存在,则跳过该工作表或进行其他处理
160
- continue
161
  if i == 3 :
162
- word1_list = df_sheet['Stimuli-2']
163
- word2_list = df_sheet['Stimuli-3']
164
  V2_column = []
165
  for jj in range(len(word1_list)):
166
  V2_column.append(word1_list[jj] + '_' + word2_list[jj])
@@ -175,17 +175,17 @@ class SummaryGenerator:
175
  Item_column = df_sheet["Item"]
176
  Condition_column = df_sheet["Condition"]
177
  Stimuli_1_column = df_sheet["Stimuli-1"]
178
- if 'Stimuli-2' in df_sheet.columns:
179
  Stimuli_2_column = df_sheet["Stimuli-2"]
180
 
181
  for j, prompt_value in enumerate(tqdm(prompt_column, desc=f"Processing {sheet_name}"), start=0):
182
  if exit_outer_loop:
183
  break
184
- ID = 'E' + str(i)
185
  # q_ID = ID + '_' + str(j)
186
-
187
  # print(ID, q_ID, prompt_value)
188
- system_prompt = envs.SYSTEM_PROMPT
189
  _user_prompt = prompt_value
190
  for ii in range(10):
191
  # user_prompt = f"{envs.USER_PROMPT}\nPassage:\n{_source}"
@@ -193,7 +193,7 @@ class SummaryGenerator:
193
  try:
194
  '''调用'''
195
  print(ID,'-',ii)
196
-
197
  _response = self.generate_summary(system_prompt, _user_prompt)
198
  # print(f"Finish index {index}")
199
  break
@@ -212,17 +212,24 @@ class SummaryGenerator:
212
  print(f"Quota has reached, wait for {wait_time}")
213
  time.sleep(wait_time)
214
  else:
215
- print(f"Error at index {i}: {e}")
216
- wait_time = 3600
217
- time.sleep(wait_time)
218
- try:
219
- _response = self.generate_summary(system_prompt, _user_prompt)
220
- break
221
- except Exception as e:
222
- exceptions.append(e)
223
  print(f"Error at index {i}: {e}")
224
- exit_outer_loop = True # 设置标志变量为True,准备退出最外层循环
225
- break # 跳出当前的 while 循环
 
 
 
 
 
 
 
 
 
 
226
 
227
  if exit_outer_loop:
228
  break
@@ -272,9 +279,9 @@ class SummaryGenerator:
272
  Experiment_ID.append(ID)
273
  Questions_ID.append(q_column[j])
274
  User_prompt.append(_user_prompt)
275
-
276
  Response.append(_response2)
277
-
278
  Factor_2.append(V2_column[j])
279
  Stimuli_1.append(Stimuli_2_column[j])
280
  Item_ID.append(Item_column[j])
@@ -286,18 +293,18 @@ class SummaryGenerator:
286
  User_prompt.append(_user_prompt)
287
  Response.append(_response1)
288
 
289
-
290
-
291
  Factor_2.append(V2_column[j])
292
  Stimuli_1.append(Stimuli_1_column[j])
293
  Item_ID.append(Item_column[j])
294
  Condition.append(Condition_column[j])
295
-
296
  else:
297
  Experiment_ID.append(ID)
298
  Questions_ID.append(q_column[j])
299
  User_prompt.append(_user_prompt)
300
-
301
  Response.append(_response)
302
  if i == 6:
303
  Factor_2.append(Condition_column[j])
@@ -309,7 +316,7 @@ class SummaryGenerator:
309
  Condition.append(Condition_column[j])
310
  print(_response)
311
 
312
-
313
  # exit()
314
 
315
  # Sleep to prevent hitting rate limits too frequently
@@ -322,14 +329,14 @@ class SummaryGenerator:
322
  print(f'Save summaries to {save_path}')
323
  fpath = Path(save_path)
324
  fpath.parent.mkdir(parents=True, exist_ok=True)
325
- self.summaries_df.to_csv(fpath)
326
 
327
  self.exceptions = exceptions
328
  # self._compute_avg_length()
329
  # self._compute_answer_rate()
330
 
331
  return self.summaries_df
332
-
333
  def generate_summary(self, system_prompt: str, user_prompt: str):
334
  # Using Together AI API
335
  using_together_api = False
@@ -388,28 +395,115 @@ class SummaryGenerator:
388
  result = ''
389
  print(result)
390
  return result
 
 
 
 
 
 
 
 
 
 
 
391
 
392
- # Using OpenAI API
393
- elif 'gpt' in self.model_id.lower():
394
- response = litellm.completion(
395
- model=self.model_id.replace('openai/',''),
396
- messages=[{"role": "system", "content": system_prompt},
397
- {"role": "user", "content": user_prompt}],
398
- # temperature=0.0,
399
- max_tokens=50,
400
- api_key = os.getenv('OpenAI_key')
401
- )
402
- result = response['choices'][0]['message']['content']
403
- # print()
404
- print(result)
 
 
 
 
 
 
405
  return result
406
-
407
- # Using Google AI API for Gemini models
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
408
  elif 'gemini' in self.model_id.lower():
409
  genai.configure(api_key=os.getenv('GOOGLE_AI_API_KEY'))
410
  generation_config = {
411
  "temperature": 0,
412
- "top_p": 0.95, # cannot change
413
  "top_k": 0,
414
  "max_output_tokens": 50,
415
  # "response_mime_type": "application/json",
@@ -432,101 +526,35 @@ class SummaryGenerator:
432
  "threshold": "BLOCK_NONE"
433
  },
434
  ]
435
- model = genai.GenerativeModel(model_name="gemini-1.5-pro-latest" if "gemini-1.5-pro" in self.model_id.lower() else self.model_id.lower().split('google/')[-1],
436
- generation_config=generation_config,
437
- system_instruction=system_prompt,
438
- safety_settings=safety_settings)
 
 
439
  convo = model.start_chat(history=[])
440
  convo.send_message(user_prompt)
441
  # print(convo.last)
442
  result = convo.last.text
443
  print(result)
444
  return result
445
-
446
- # Using HF API or download checkpoints
447
- elif self.local_model is None:
448
- # print(self.model_id)
449
- # print(self.api_base)
450
- # mistralai/Mistral-7B-Instruct-v0.1
451
- # https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.1
452
- try: # try use HuggingFace API
453
- # response = litellm.completion(
454
- # model="huggingface/"+'command-r-plus' if 'command' in self.model_id else self.model_id,
455
- # messages=[{"role": "system", "content": system_prompt},
456
- # {"role": "user", "content": user_prompt}],
457
- # temperature=0.0,
458
- # max_tokens=1024,
459
- # api_base= "https://api-inference.huggingface.co/models/" + self.model_id,
460
- # )
461
- # self.model_id = 'command-r-plus' if 'command' in self.model_id else self.model_id
462
- # response = litellm.completion(
463
- # model="huggingface/" + self.model_id,
464
- # # mistralai/Mistral-7B-Instruct-v0.1",
465
- # messages=[{"role": "system", "content": system_prompt},
466
- # {"role": "user", "content": user_prompt}],
467
- # #temperature=0.0,
468
- # max_tokens=1024,
469
- # api_base="https://api-inference.huggingface.co/models/" + self.model_id)
470
- # print("模型返回结果",response)
471
- # print("模型返回结果结束")
472
- # # exit()
473
- # result = response['choices'][0]['message']['content']
474
- # print(result)
475
- from huggingface_hub import InferenceClient
476
- print("token_for_request:",envs.TOKEN)
477
- print(self.model_id)
478
- client = InferenceClient(self.model_id,api_key=envs.TOKEN,headers={"X-use-cache": "false"})
479
- messages = [{"role": "system", "content": system_prompt},{"role": "user", "content": user_prompt}]
480
- # outputs = client.chat_completion(messages, max_tokens=50)
481
- result = None
482
- while result is None:
483
- outputs = client.chat_completion(messages, max_tokens=50)
484
- result = outputs['choices'][0]['message']['content']
485
-
486
- if result is None:
487
- time.sleep(1) # Optional: Add a small delay before retrying
488
-
489
- return result
490
- # exit()
491
- except: # fail to call api. run it locally.
492
- self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, trust_remote_code=True)
493
- print("Tokenizer loaded")
494
- self.local_model = AutoModelForCausalLM.from_pretrained(self.model_id, trust_remote_code=True, device_map="auto", torch_dtype="auto", cache_dir='/home/paperspace/cache')
495
- print("Local model loaded")
496
  # exit()
497
  # Using local model
498
- if self.local_model: # cannot call API. using local model
499
- messages=[
500
- {"role": "system", "content": system_prompt}, # gemma-1.1 does not accept system role
501
- {"role": "user", "content": user_prompt}
502
- ]
503
- try: # some models support pipeline
504
- pipe = pipeline(
505
- "text-generation",
506
- model=self.local_model,
507
- tokenizer=self.tokenizer,
508
- )
509
-
510
- generation_args = {
511
- "max_new_tokens": 50,
512
- "return_full_text": False,
513
- #"temperature": 0.0,
514
- "do_sample": False,
515
- }
516
 
517
- output = pipe(messages, **generation_args)
518
- result = output[0]['generated_text']
519
- print(result)
520
- except:
521
- prompt = self.tokenizer.apply_chat_template(messages,add_generation_prompt=True, tokenize=False)
522
- print(prompt)
523
- input_ids = self.tokenizer(prompt, return_tensors="pt").to('cuda')
524
- with torch.no_grad():
525
- outputs = self.local_model.generate(**input_ids, max_new_tokens=50, do_sample=True, pad_token_id=self.tokenizer.eos_token_id)
526
- result = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
527
- result = result.replace(prompt[0], '')
528
- print(result)
529
- return result
530
 
531
  def _compute_avg_length(self):
532
  """
@@ -607,7 +635,7 @@ class EvaluationModel:
607
  for i in range(len(summaries_df["Experiment"])):
608
  # vote_1_1, vote_1_2, vote_1_3 = 0, 0, 0
609
  # print()
610
- if pd.isna(summaries_df["Response"][i]):
611
  output.append("Other")
612
  continue
613
  rs = summaries_df["Response"][i].strip().lower()
@@ -627,7 +655,7 @@ class EvaluationModel:
627
  output.append("Spiky")
628
  else:
629
  output.append("Other")
630
-
631
 
632
  '''Exp2'''
633
 
@@ -647,12 +675,12 @@ class EvaluationModel:
647
  break
648
  if male == 0 and female == 0 :
649
  output.append("Other")
650
-
651
  '''Exp3'''
652
  elif summaries_df["Experiment"][i] == "E3":
653
  # rs = summaries_df["Response"][i].strip()
654
  print("E3", rs)
655
- if pd.isna(summaries_df["Factor 2"][i]):
656
  output.append("Other")
657
  else:
658
  if summaries_df["Factor 2"][i].strip() == "LS":
@@ -668,9 +696,9 @@ class EvaluationModel:
668
  elif "3" in rs:
669
  output.append("Long")
670
  else:
671
- output.append("Other")
672
  '''Exp4'''
673
-
674
  elif summaries_df["Experiment"][i] == "E4":
675
  # rs = summaries_df["Response"][i].strip()
676
  target = summaries_df["Factor 2"][i].strip().lower()
@@ -704,8 +732,8 @@ class EvaluationModel:
704
  verb = item2verb2[item_id].lower()
705
  sentence = sti2.replace("...","") + " " + rs.replace(sti2, "")
706
  print("E5", verb, sentence)
707
-
708
-
709
  doc = nlp1(sentence.replace(" "," "))
710
  # print(doc)
711
  # print()
@@ -745,8 +773,8 @@ class EvaluationModel:
745
 
746
  elif summaries_df["Experiment"][i] == "E6":
747
  sentence = summaries_df["Stimuli 1"][i].strip().lower()
748
- print("E6", sentence)
749
- doc = nlp1(sentence)
750
  subject = "None"
751
  obj = "None"
752
  # 遍历依存关系,寻找主语和宾语
@@ -767,9 +795,9 @@ class EvaluationModel:
767
  output.append("NP")
768
  else:
769
  print(rs, subject, obj, "Other")
770
- output.append("Other")
 
771
 
772
-
773
 
774
 
775
  '''Exp7'''
@@ -786,7 +814,7 @@ class EvaluationModel:
786
  '''Exp8'''
787
  elif summaries_df["Experiment"][i] == "E8":
788
  # rs = summaries_df["Response"][i].strip()
789
-
790
  if "something is wrong with the question" in rs:
791
  output.append("1")
792
  else:
@@ -795,7 +823,7 @@ class EvaluationModel:
795
  '''Exp9'''
796
  elif summaries_df["Experiment"][i] == "E9":
797
  male, female = 0, 0
798
-
799
  # rs = summaries_df["Response"][i].strip()
800
  if "because" in rs:
801
  rs = rs.replace("because because","because").split("because")[1]
@@ -847,8 +875,8 @@ class EvaluationModel:
847
  # '''LLM'''
848
  # self.data = pd.DataFrame(list(zip(summaries_df["Experiment"], summaries_df["Question_ID"], summaries_df["Item"], summaries_df["Response"], summaries_df["Factor 2"], summaries_df["Stimuli 1"], output)),
849
  # columns=["Experiment", "Question_ID", "Item", "Response", "Factor 2", "Simulate 1","Coding"])
850
- print(self.data.head())
851
-
852
  return self.data
853
  def code_results_llm(self, summaries_df):
854
  '''code results from LLM's response'''
@@ -878,7 +906,7 @@ class EvaluationModel:
878
  for i in range(len(summaries_df["Experiment"])):
879
  # vote_1_1, vote_1_2, vote_1_3 = 0, 0, 0
880
  # print()
881
- if pd.isna(summaries_df["Response"][i]):
882
  output.append("Other")
883
  continue
884
  rs = summaries_df["Response"][i].strip().lower()
@@ -893,7 +921,7 @@ class EvaluationModel:
893
  output.append("Spiky")
894
  else:
895
  output.append("Other")
896
-
897
 
898
  '''Exp2'''
899
 
@@ -913,13 +941,13 @@ class EvaluationModel:
913
  break
914
  if male == 0 and female == 0 :
915
  output.append("Other")
916
-
917
  '''Exp3'''
918
  elif summaries_df["Experiment"][i] == "E3":
919
  # rs = summaries_df["Response"][i].strip()
920
  print("E3", rs)
921
  rs = rs.replace('"', '')
922
- pair = summaries_df["Factor 2"][i]
923
  word1, word2 = pair.split('_')
924
 
925
  if rs == word1:
@@ -980,8 +1008,8 @@ class EvaluationModel:
980
  verb = item2verb2[item_id].lower()
981
  sentence = sti2.replace("...","") + " " + rs.replace(sti2, "")
982
  print("E5", verb, sentence)
983
-
984
-
985
  doc = nlp1(sentence.replace(" "," "))
986
  # print(doc)
987
  # print()
@@ -1021,8 +1049,8 @@ class EvaluationModel:
1021
 
1022
  elif summaries_df["Experiment"][i] == "E6":
1023
  sentence = summaries_df["Stimuli 1"][i].strip().lower()
1024
- print("E6", sentence)
1025
- doc = nlp1(sentence)
1026
  subject = "None"
1027
  obj = "None"
1028
  # 遍历依存关系,寻找主语和宾语
@@ -1043,9 +1071,9 @@ class EvaluationModel:
1043
  output.append("NP")
1044
  else:
1045
  print(rs, subject, obj, "Other")
1046
- output.append("Other")
 
1047
 
1048
-
1049
 
1050
 
1051
  '''Exp7'''
@@ -1072,7 +1100,7 @@ class EvaluationModel:
1072
  '''Exp9'''
1073
  elif summaries_df["Experiment"][i] == "E9":
1074
  male, female = 0, 0
1075
-
1076
  # rs = summaries_df["Response"][i].strip()
1077
  if "because" in rs:
1078
  rs = rs.replace("because because","because").split("because")[1]
@@ -1125,14 +1153,14 @@ class EvaluationModel:
1125
  '''LLM'''
1126
  self.data = pd.DataFrame(list(zip(summaries_df["Experiment"], summaries_df["Question_ID"], summaries_df["Item"], summaries_df["Response"], summaries_df["Factor 2"], summaries_df["Stimuli 1"], output)),
1127
  columns=["Experiment", "Question_ID", "Item", "Response", "Factor 2", "Simulate 1","Coding"])
1128
- print(self.data.head())
1129
-
1130
  return self.data
1131
-
1132
 
1133
-
1134
-
1135
-
 
1136
 
1137
 
1138
  def calculate_js_divergence(self, file_path_1, file_path_2):
@@ -1225,7 +1253,7 @@ class EvaluationModel:
1225
  print("avg_js_divergence:", avg_js_divergence)
1226
 
1227
  return avg_js_divergence
1228
-
1229
 
1230
  def evaluate_humanlike(self, summaries_df: object, human_data_path: object, result_save_path: object) -> object:
1231
  '''
@@ -1272,19 +1300,19 @@ class EvaluationModel:
1272
 
1273
 
1274
 
1275
-
1276
 
1277
 
1278
 
1279
 
1280
-
1281
-
1282
-
1283
-
1284
 
1285
 
1286
-
1287
-
 
 
 
 
 
1288
 
1289
 
1290
 
 
2
  import time
3
  from datetime import datetime
4
  import logging
5
+ from pathlib import Path
6
  import requests
7
  import json
8
 
 
135
  # prompt = {}
136
  # for index, row in tqdm(df_prompt.iterrows(), total=df_prompt.shape[0]):
137
  # prompt['E' + row['Item']] = row['Prompt']
138
+ xls = pd.ExcelFile(dataset)
139
  sheet_names = xls.sheet_names
140
  # sheet_names = df.sheetnames
141
+ print(f"Total: {len(sheet_names)}")
142
+ print(sheet_names)
143
+
144
  Experiment_ID, Questions_ID, Item_ID, Condition, User_prompt, Response, Factor_2, Stimuli_1 = [], [], [], [], [] ,[], [], []
145
  exit_outer_loop = False # bad model
146
  for i, sheet_name in enumerate(sheet_names, start=1):
 
150
  # if i > 2 and i ==1:
151
  # continue
152
  print(i, sheet_name)
153
+ df_sheet = pd.read_excel(xls, sheet_name=sheet_name)
154
+
155
  # 假设第一列是'Prompt0',但这里我们使用列名来避免硬编码
156
+ if 'Prompt0' in df_sheet.columns:
157
+ prompt_column = df_sheet['Prompt0']
158
+ else:
159
  # 如果'Prompt0'列不存在,则跳过该工作表或进行其他处理
160
+ continue
161
  if i == 3 :
162
+ word1_list = df_sheet['Stimuli-2']
163
+ word2_list = df_sheet['Stimuli-3']
164
  V2_column = []
165
  for jj in range(len(word1_list)):
166
  V2_column.append(word1_list[jj] + '_' + word2_list[jj])
 
175
  Item_column = df_sheet["Item"]
176
  Condition_column = df_sheet["Condition"]
177
  Stimuli_1_column = df_sheet["Stimuli-1"]
178
+ if 'Stimuli-2' in df_sheet.columns:
179
  Stimuli_2_column = df_sheet["Stimuli-2"]
180
 
181
  for j, prompt_value in enumerate(tqdm(prompt_column, desc=f"Processing {sheet_name}"), start=0):
182
  if exit_outer_loop:
183
  break
184
+ ID = 'E' + str(i)
185
  # q_ID = ID + '_' + str(j)
186
+
187
  # print(ID, q_ID, prompt_value)
188
+ system_prompt = envs.SYSTEM_PROMPT
189
  _user_prompt = prompt_value
190
  for ii in range(10):
191
  # user_prompt = f"{envs.USER_PROMPT}\nPassage:\n{_source}"
 
193
  try:
194
  '''调用'''
195
  print(ID,'-',ii)
196
+
197
  _response = self.generate_summary(system_prompt, _user_prompt)
198
  # print(f"Finish index {index}")
199
  break
 
212
  print(f"Quota has reached, wait for {wait_time}")
213
  time.sleep(wait_time)
214
  else:
215
+ max_retries = 30
216
+ retries = 0
217
+ wait_time = 120
218
+
219
+ while retries < max_retries:
 
 
 
220
  print(f"Error at index {i}: {e}")
221
+ time.sleep(wait_time)
222
+ try:
223
+ _response = self.generate_summary(system_prompt, _user_prompt)
224
+ break
225
+ except Exception as e:
226
+ exceptions.append(e)
227
+ retries += 1
228
+ print(f"Retry {retries}/{max_retries} failed at index {i}: {e}")
229
+ if retries >= max_retries:
230
+ exit_outer_loop = True
231
+ break
232
+
233
 
234
  if exit_outer_loop:
235
  break
 
279
  Experiment_ID.append(ID)
280
  Questions_ID.append(q_column[j])
281
  User_prompt.append(_user_prompt)
282
+
283
  Response.append(_response2)
284
+
285
  Factor_2.append(V2_column[j])
286
  Stimuli_1.append(Stimuli_2_column[j])
287
  Item_ID.append(Item_column[j])
 
293
  User_prompt.append(_user_prompt)
294
  Response.append(_response1)
295
 
296
+
297
+
298
  Factor_2.append(V2_column[j])
299
  Stimuli_1.append(Stimuli_1_column[j])
300
  Item_ID.append(Item_column[j])
301
  Condition.append(Condition_column[j])
302
+
303
  else:
304
  Experiment_ID.append(ID)
305
  Questions_ID.append(q_column[j])
306
  User_prompt.append(_user_prompt)
307
+
308
  Response.append(_response)
309
  if i == 6:
310
  Factor_2.append(Condition_column[j])
 
316
  Condition.append(Condition_column[j])
317
  print(_response)
318
 
319
+
320
  # exit()
321
 
322
  # Sleep to prevent hitting rate limits too frequently
 
329
  print(f'Save summaries to {save_path}')
330
  fpath = Path(save_path)
331
  fpath.parent.mkdir(parents=True, exist_ok=True)
332
+ self.summaries_df.to_csv(fpath)
333
 
334
  self.exceptions = exceptions
335
  # self._compute_avg_length()
336
  # self._compute_answer_rate()
337
 
338
  return self.summaries_df
339
+
340
  def generate_summary(self, system_prompt: str, user_prompt: str):
341
  # Using Together AI API
342
  using_together_api = False
 
395
  result = ''
396
  print(result)
397
  return result
398
+ if self.local_model: # cannot call API. using local model
399
+ messages=[
400
+ {"role": "system", "content": system_prompt}, # gemma-1.1 does not accept system role
401
+ {"role": "user", "content": user_prompt}
402
+ ]
403
+ try: # some models support pipeline
404
+ pipe = pipeline(
405
+ "text-generation",
406
+ model=self.local_model,
407
+ tokenizer=self.tokenizer,
408
+ )
409
 
410
+ generation_args = {
411
+ "max_new_tokens": 50,
412
+ "return_full_text": False,
413
+ #"temperature": 0.0,
414
+ "do_sample": False,
415
+ }
416
+
417
+ output = pipe(messages, **generation_args)
418
+ result = output[0]['generated_text']
419
+ print(result)
420
+ except:
421
+ prompt = self.tokenizer.apply_chat_template(messages,add_generation_prompt=True, tokenize=False)
422
+ print(prompt)
423
+ input_ids = self.tokenizer(prompt, return_tensors="pt").to('cuda')
424
+ with torch.no_grad():
425
+ outputs = self.local_model.generate(**input_ids, max_new_tokens=50, do_sample=True, pad_token_id=self.tokenizer.eos_token_id)
426
+ result = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
427
+ result = result.replace(prompt[0], '')
428
+ print(result)
429
  return result
430
+
431
+
432
+ elif self.local_model is None:
433
+ # print(self.model_id)
434
+ # print(self.api_base)
435
+ # mistralai/Mistral-7B-Instruct-v0.1
436
+ # https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.1
437
+ # Using HF API or download checkpoints
438
+ try: # try use HuggingFace API
439
+ from huggingface_hub import InferenceClient
440
+ print("token_for_request:",envs.TOKEN)
441
+ print(self.model_id)
442
+ client = InferenceClient(self.model_id,api_key=envs.TOKEN,headers={"X-use-cache": "false"})
443
+ messages = [{"role": "system", "content": system_prompt},{"role": "user", "content": user_prompt}]
444
+ # outputs = client.chat_completion(messages, max_tokens=50)
445
+ result = None
446
+ while result is None:
447
+ outputs = client.chat_completion(messages, max_tokens=50)
448
+ result = outputs['choices'][0]['message']['content']
449
+
450
+ if result is None:
451
+ time.sleep(1) # Optional: Add a small delay before retrying
452
+
453
+ return result
454
+
455
+ except Exception as e:
456
+ print(f"Error with TOKEN: {envs.TOKEN}, trying with TOKEN1")
457
+ try:
458
+ client = InferenceClient(self.model_id, api_key=envs.TOKEN1, headers={"X-use-cache": "false"})
459
+ messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
460
+ result = None
461
+ while result is None:
462
+ outputs = client.chat_completion(messages, max_tokens=50)
463
+ result = outputs['choices'][0]['message']['content']
464
+
465
+ if result is None:
466
+ time.sleep(1) # Optional: Add a small delay before retrying
467
+
468
+ return result
469
+ except Exception as e:
470
+ print(f"Error with TOKEN1: {envs.TOKEN1}")
471
+ raise e
472
+
473
+ # except: # fail to call api. run it locally.
474
+ # self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, trust_remote_code=True)
475
+ # print("Tokenizer loaded")
476
+ # self.local_model = AutoModelForCausalLM.from_pretrained(self.model_id, trust_remote_code=True, device_map="auto", torch_dtype="auto", cache_dir='/home/paperspace/cache')
477
+ # print("Local model loaded")
478
+ # response = litellm.completion(
479
+ # model="huggingface/"+'command-r-plus' if 'command' in self.model_id else self.model_id,
480
+ # messages=[{"role": "system", "content": system_prompt},
481
+ # {"role": "user", "content": user_prompt}],
482
+ # temperature=0.0,
483
+ # max_tokens=1024,
484
+ # api_base= "https://api-inference.huggingface.co/models/" + self.model_id,
485
+ # )
486
+ # self.model_id = 'command-r-plus' if 'command' in self.model_id else self.model_id
487
+ # response = litellm.completion(
488
+ # model="huggingface/" + self.model_id,
489
+ # # mistralai/Mistral-7B-Instruct-v0.1",
490
+ # messages=[{"role": "system", "content": system_prompt},
491
+ # {"role": "user", "content": user_prompt}],
492
+ # #temperature=0.0,
493
+ # max_tokens=1024,
494
+ # api_base="https://api-inference.huggingface.co/models/" + self.model_id)
495
+ # print("模型返回结果",response)
496
+ # print("模型返回结果结束")
497
+ # # exit()
498
+ # result = response['choices'][0]['message']['content']
499
+ # print(result)
500
+ # exit()
501
+ # Using Google AI API for Gemini models
502
  elif 'gemini' in self.model_id.lower():
503
  genai.configure(api_key=os.getenv('GOOGLE_AI_API_KEY'))
504
  generation_config = {
505
  "temperature": 0,
506
+ "top_p": 0.95, # cannot change
507
  "top_k": 0,
508
  "max_output_tokens": 50,
509
  # "response_mime_type": "application/json",
 
526
  "threshold": "BLOCK_NONE"
527
  },
528
  ]
529
+ model = genai.GenerativeModel(
530
+ model_name="gemini-1.5-pro-latest" if "gemini-1.5-pro" in self.model_id.lower() else
531
+ self.model_id.lower().split('google/')[-1],
532
+ generation_config=generation_config,
533
+ system_instruction=system_prompt,
534
+ safety_settings=safety_settings)
535
  convo = model.start_chat(history=[])
536
  convo.send_message(user_prompt)
537
  # print(convo.last)
538
  result = convo.last.text
539
  print(result)
540
  return result
541
+ # Using OpenAI API
542
+ elif 'gpt' in self.model_id.lower():
543
+ response = litellm.completion(
544
+ model=self.model_id.replace('openai/',''),
545
+ messages=[{"role": "system", "content": system_prompt},
546
+ {"role": "user", "content": user_prompt}],
547
+ # temperature=0.0,
548
+ max_tokens=50,
549
+ api_key = os.getenv('OpenAI_key')
550
+ )
551
+ result = response['choices'][0]['message']['content']
552
+ # print()
553
+ print(result)
554
+ return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
555
  # exit()
556
  # Using local model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
557
 
 
 
 
 
 
 
 
 
 
 
 
 
 
558
 
559
  def _compute_avg_length(self):
560
  """
 
635
  for i in range(len(summaries_df["Experiment"])):
636
  # vote_1_1, vote_1_2, vote_1_3 = 0, 0, 0
637
  # print()
638
+ if pd.isna(summaries_df["Response"][i]):
639
  output.append("Other")
640
  continue
641
  rs = summaries_df["Response"][i].strip().lower()
 
655
  output.append("Spiky")
656
  else:
657
  output.append("Other")
658
+
659
 
660
  '''Exp2'''
661
 
 
675
  break
676
  if male == 0 and female == 0 :
677
  output.append("Other")
678
+
679
  '''Exp3'''
680
  elif summaries_df["Experiment"][i] == "E3":
681
  # rs = summaries_df["Response"][i].strip()
682
  print("E3", rs)
683
+ if pd.isna(summaries_df["Factor 2"][i]):
684
  output.append("Other")
685
  else:
686
  if summaries_df["Factor 2"][i].strip() == "LS":
 
696
  elif "3" in rs:
697
  output.append("Long")
698
  else:
699
+ output.append("Other")
700
  '''Exp4'''
701
+
702
  elif summaries_df["Experiment"][i] == "E4":
703
  # rs = summaries_df["Response"][i].strip()
704
  target = summaries_df["Factor 2"][i].strip().lower()
 
732
  verb = item2verb2[item_id].lower()
733
  sentence = sti2.replace("...","") + " " + rs.replace(sti2, "")
734
  print("E5", verb, sentence)
735
+
736
+
737
  doc = nlp1(sentence.replace(" "," "))
738
  # print(doc)
739
  # print()
 
773
 
774
  elif summaries_df["Experiment"][i] == "E6":
775
  sentence = summaries_df["Stimuli 1"][i].strip().lower()
776
+ print("E6", sentence)
777
+ doc = nlp1(sentence)
778
  subject = "None"
779
  obj = "None"
780
  # 遍历依存关系,寻找主语和宾语
 
795
  output.append("NP")
796
  else:
797
  print(rs, subject, obj, "Other")
798
+ output.append("Other")
799
+
800
 
 
801
 
802
 
803
  '''Exp7'''
 
814
  '''Exp8'''
815
  elif summaries_df["Experiment"][i] == "E8":
816
  # rs = summaries_df["Response"][i].strip()
817
+
818
  if "something is wrong with the question" in rs:
819
  output.append("1")
820
  else:
 
823
  '''Exp9'''
824
  elif summaries_df["Experiment"][i] == "E9":
825
  male, female = 0, 0
826
+
827
  # rs = summaries_df["Response"][i].strip()
828
  if "because" in rs:
829
  rs = rs.replace("because because","because").split("because")[1]
 
875
  # '''LLM'''
876
  # self.data = pd.DataFrame(list(zip(summaries_df["Experiment"], summaries_df["Question_ID"], summaries_df["Item"], summaries_df["Response"], summaries_df["Factor 2"], summaries_df["Stimuli 1"], output)),
877
  # columns=["Experiment", "Question_ID", "Item", "Response", "Factor 2", "Simulate 1","Coding"])
878
+ print(self.data.head())
879
+
880
  return self.data
881
  def code_results_llm(self, summaries_df):
882
  '''code results from LLM's response'''
 
906
  for i in range(len(summaries_df["Experiment"])):
907
  # vote_1_1, vote_1_2, vote_1_3 = 0, 0, 0
908
  # print()
909
+ if pd.isna(summaries_df["Response"][i]):
910
  output.append("Other")
911
  continue
912
  rs = summaries_df["Response"][i].strip().lower()
 
921
  output.append("Spiky")
922
  else:
923
  output.append("Other")
924
+
925
 
926
  '''Exp2'''
927
 
 
941
  break
942
  if male == 0 and female == 0 :
943
  output.append("Other")
944
+
945
  '''Exp3'''
946
  elif summaries_df["Experiment"][i] == "E3":
947
  # rs = summaries_df["Response"][i].strip()
948
  print("E3", rs)
949
  rs = rs.replace('"', '')
950
+ pair = summaries_df["Factor 2"][i]
951
  word1, word2 = pair.split('_')
952
 
953
  if rs == word1:
 
1008
  verb = item2verb2[item_id].lower()
1009
  sentence = sti2.replace("...","") + " " + rs.replace(sti2, "")
1010
  print("E5", verb, sentence)
1011
+
1012
+
1013
  doc = nlp1(sentence.replace(" "," "))
1014
  # print(doc)
1015
  # print()
 
1049
 
1050
  elif summaries_df["Experiment"][i] == "E6":
1051
  sentence = summaries_df["Stimuli 1"][i].strip().lower()
1052
+ print("E6", sentence)
1053
+ doc = nlp1(sentence)
1054
  subject = "None"
1055
  obj = "None"
1056
  # 遍历依存关系,寻找主语和宾语
 
1071
  output.append("NP")
1072
  else:
1073
  print(rs, subject, obj, "Other")
1074
+ output.append("Other")
1075
+
1076
 
 
1077
 
1078
 
1079
  '''Exp7'''
 
1100
  '''Exp9'''
1101
  elif summaries_df["Experiment"][i] == "E9":
1102
  male, female = 0, 0
1103
+
1104
  # rs = summaries_df["Response"][i].strip()
1105
  if "because" in rs:
1106
  rs = rs.replace("because because","because").split("because")[1]
 
1153
  '''LLM'''
1154
  self.data = pd.DataFrame(list(zip(summaries_df["Experiment"], summaries_df["Question_ID"], summaries_df["Item"], summaries_df["Response"], summaries_df["Factor 2"], summaries_df["Stimuli 1"], output)),
1155
  columns=["Experiment", "Question_ID", "Item", "Response", "Factor 2", "Simulate 1","Coding"])
1156
+ print(self.data.head())
1157
+
1158
  return self.data
 
1159
 
1160
+
1161
+
1162
+
1163
+
1164
 
1165
 
1166
  def calculate_js_divergence(self, file_path_1, file_path_2):
 
1253
  print("avg_js_divergence:", avg_js_divergence)
1254
 
1255
  return avg_js_divergence
1256
+
1257
 
1258
  def evaluate_humanlike(self, summaries_df: object, human_data_path: object, result_save_path: object) -> object:
1259
  '''
 
1300
 
1301
 
1302
 
 
1303
 
1304
 
1305
 
1306
 
 
 
 
 
1307
 
1308
 
1309
+
1310
+
1311
+
1312
+
1313
+
1314
+
1315
+
1316
 
1317
 
1318
 
src/display/about.py CHANGED
@@ -33,15 +33,13 @@ An improved version (HHEM v2) is integrated into the [Vectara platform](https://
33
  LLM_BENCHMARKS_TEXT = """
34
  ## Introduction
35
 
36
- The Hughes Hallucination Evaluation Model (HHEM) Leaderboard is dedicated to assessing the frequency of hallucinations in document summaries generated by Large Language Models (LLMs).
37
-
38
- Hallucinations refer to instances where a model introduces factually incorrect or unrelated content in its summaries.
39
 
40
  ## How it works
41
 
42
- Using [Vectara](https://vectara.com)'s HHEM, we measure the occurrence of hallucinations in generated summaries.
43
- Given a source document and a summary generated by an LLM, HHEM outputs a hallucination score between 0 and 1, with 0 indicating complete hallucination and 1 representing perfect factual consistency.
44
- The model card for HHEM can be found [here](https://huggingface.co/vectara/hallucination_evaluation_model).
45
 
46
  ## Evaluation Dataset
47
 
@@ -49,10 +47,8 @@ Our evaluation dataset consists of 1006 documents from multiple public datasets,
49
  We generate summaries for each of these documents using submitted LLMs and compute hallucination scores for each pair of document and generated summary. (Check the prompt we used [here](https://huggingface.co/spaces/vectara/Hallucination-evaluation-leaderboard))
50
 
51
  ## Metrics Explained
52
- - Hallucination Rate: Percentage of summaries with a hallucination score below 0.5
53
- - Factual Consistency Rate: The complement of the hallucination rate, expressed as a percentage.
54
- - Answer Rate: Percentage of summaries that are non-empty. This is either the model refuses to generate a response or throws an error due to various reasons. (e.g. the model believes that the document includes inappropriate content)
55
- - Average Summary Length: The average word count of generated summaries
56
 
57
  ## Note on non-Hugging Face models
58
  On HHEM leaderboard, There are currently models such as GPT variants that are not available on the Hugging Face model hub. We ran the evaluations for these models on our own and uploaded the results to the leaderboard.
@@ -61,18 +57,13 @@ If you would like to submit your model that is not available on the Hugging Face
61
  ## Model Submissions and Reproducibility
62
  You can submit your model for evaluation, whether it's hosted on the Hugging Face model hub or not. (Though it is recommended to host your model on the Hugging Face)
63
 
64
- ### For models not available on the Hugging Face model hub:
65
- 1) Access generated summaries used for evaluation [here](https://github.com/vectara/hallucination-leaderboard) in "leaderboard_summaries.csv".
66
- 2) The text generation prompt is available under "Prompt Used" section in the repository's README.
67
- 3) Details on API Integration for evaluations are under "API Integration Details".
68
-
69
  ### For models available on the Hugging Face model hub:
70
  To replicate the evaluation result for a Hugging Face model:
71
 
72
  1) Clone the Repository
73
  ```python
74
  git lfs install
75
- git clone https://huggingface.co/spaces/vectara/leaderboard
76
  ```
77
  2) Install the Requirements
78
  ```python
 
33
  LLM_BENCHMARKS_TEXT = """
34
  ## Introduction
35
 
36
+ This study uses ten psycholinguistic tasks to measure how closely a model's language use resembles that of humans. Each task consists of multiple stimuli, and each stimulus has both expected and unexpected responses.
37
+ To quantify the similarity, we collected responses from 2000 human participants, creating a binomial distribution for each stimulus within each task. The same stimuli were then presented to a language model, generating another binomial distribution for comparison.
 
38
 
39
  ## How it works
40
 
41
+ To measure the similarity between human and model responses, we utilize the Jensen-Shannon (JS) divergence. This method allows us to compare the two binomial distributions (one from human responses and one from model responses) for each stimulus.
42
+ The similarity is quantified by calculating 1 minus the JS divergence, where a value closer to 1 indicates higher similarity.
 
43
 
44
  ## Evaluation Dataset
45
 
 
47
  We generate summaries for each of these documents using submitted LLMs and compute hallucination scores for each pair of document and generated summary. (Check the prompt we used [here](https://huggingface.co/spaces/vectara/Hallucination-evaluation-leaderboard))
48
 
49
  ## Metrics Explained
50
+ - Individual Task Similarity: For each psycholinguistic task, we calculate the humanlike score for each stimulus, providing a measure of how closely the model’s responses resemble those of humans.
51
+ - Average Similarity: The average of the humanlike scores across all stimuli and tasks, giving an overall indication of the model’s performance in mimicking human language use.
 
 
52
 
53
  ## Note on non-Hugging Face models
54
  On the HHEM leaderboard, there are currently models, such as GPT variants, that are not available on the Hugging Face model hub. We ran the evaluations for these models ourselves and uploaded the results to the leaderboard.
 
57
  ## Model Submissions and Reproducibility
58
  You can submit your model for evaluation whether or not it is hosted on the Hugging Face model hub, although hosting it there is recommended.
59
 
 
 
 
 
 
60
  ### For models available on the Hugging Face model hub:
61
  To replicate the evaluation result for a Hugging Face model:
62
 
63
  1) Clone the Repository
64
  ```python
65
  git lfs install
66
+ git clone https://huggingface.co/spaces/Simondon/HumanLikeness
67
  ```
68
  2) Install the Requirements
69
  ```python
src/display/formatting.py CHANGED
@@ -1,5 +1,6 @@
1
  import os
2
  from datetime import datetime, timezone
 
3
 
4
  from huggingface_hub import HfApi
5
  from huggingface_hub.hf_api import ModelInfo
@@ -29,7 +30,8 @@ def styled_message(message):
29
 
30
 
31
  def has_no_nan_values(df, columns):
32
- return df[columns].notna().all(axis=1)
 
33
 
34
 
35
  def has_nan_values(df, columns):
 
1
  import os
2
  from datetime import datetime, timezone
3
+ import numpy as np
4
 
5
  from huggingface_hub import HfApi
6
  from huggingface_hub.hf_api import ModelInfo
 
30
 
31
 
32
  def has_no_nan_values(df, columns):
33
+ return df.iloc[:, 2].apply(lambda x: not any(np.isnan(val) for val in x))
34
+
35
 
36
 
37
  def has_nan_values(df, columns):
src/envs.py CHANGED
@@ -6,12 +6,8 @@ from huggingface_hub import HfApi
6
  # replace this with our token
7
  # TOKEN = os.environ.get("HF_TOKEN", None)
8
  TOKEN = os.getenv("H4_TOKEN")
9
- print("token:", TOKEN)
10
- # print(TOKEN)
11
- # OWNER = "vectara"
12
- # REPO_ID = f"{OWNER}/Humanlike"
13
- # QUEUE_REPO = f"{OWNER}/requests"
14
- # RESULTS_REPO = f"{OWNER}/results"
15
 
16
 
17
  OWNER = "Simondon" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 
6
  # replace this with our token
7
  # TOKEN = os.environ.get("HF_TOKEN", None)
8
  TOKEN = os.getenv("H4_TOKEN")
9
+ TOKEN1 = os.getenv("H4_TOKEN1")
10
+ # print("H4_token:", TOKEN)
 
 
 
 
11
 
12
 
13
  OWNER = "Simondon" # Change to your org - don't forget to create a results and request dataset, with the correct format!
src/leaderboard/read_evals.py CHANGED
@@ -155,23 +155,11 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
155
  model_result_filepaths = []
156
  print("results_path", results_path)
157
  for root, _, files in os.walk(results_path):
158
- # We should only have json files in model results
159
  print("file",files)
160
-
161
- # if not files or any([not f.endswith(".json") for f in files]):
162
-
163
- # continue
164
  for f in files:
165
  if f.endswith(".json"):
166
-
167
- # Sort the files by date
168
- # try:
169
- # files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
170
- # except dateutil.parser._parser.ParserError:
171
- # files = [files[-1]]
172
-
173
  model_result_filepaths.extend([os.path.join(root, f)])
174
- print("model_result_filepaths", model_result_filepaths)
175
  # exit()
176
  eval_results = {}
177
  for model_result_filepath in model_result_filepaths:
 
155
  model_result_filepaths = []
156
  print("results_path", results_path)
157
  for root, _, files in os.walk(results_path):
 
158
  print("file",files)
 
 
 
 
159
  for f in files:
160
  if f.endswith(".json"):
 
 
 
 
 
 
 
161
  model_result_filepaths.extend([os.path.join(root, f)])
162
+ print("model_result_filepaths:", model_result_filepaths)
163
  # exit()
164
  eval_results = {}
165
  for model_result_filepath in model_result_filepaths:
src/populate.py CHANGED
@@ -11,19 +11,19 @@ import src.leaderboard.read_evals as read_evals
11
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
  print(results_path, requests_path)
13
  raw_data = read_evals.get_raw_eval_results(results_path, requests_path)
14
- print("raw_data:",raw_data)
15
  all_data_json = [v.to_dict() for v in raw_data]
16
 
17
- print(all_data_json)
18
  df = pd.DataFrame.from_records(all_data_json)
19
- print(df)
20
  # exit()
21
  df = df.sort_values(by=[utils.AutoEvalColumn.hallucination_rate.name], ascending=True)
22
  df = df[cols].round(decimals=2)
23
 
24
  # filter out if any of the benchmarks have not been produced
25
  df = df[formatting.has_no_nan_values(df, benchmark_cols)]
26
- return raw_data, df
27
 
28
 
29
  def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
 
11
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
  print(results_path, requests_path)
13
  raw_data = read_evals.get_raw_eval_results(results_path, requests_path)
14
+ #print("raw_data:",raw_data)
15
  all_data_json = [v.to_dict() for v in raw_data]
16
 
17
+ #print(all_data_json)
18
  df = pd.DataFrame.from_records(all_data_json)
19
+ print("all results:",df)
20
  # exit()
21
  df = df.sort_values(by=[utils.AutoEvalColumn.hallucination_rate.name], ascending=True)
22
  df = df[cols].round(decimals=2)
23
 
24
  # filter out if any of the benchmarks have not been produced
25
  df = df[formatting.has_no_nan_values(df, benchmark_cols)]
26
+ return df
27
 
28
 
29
  def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]: