XufengDuan committed
Commit d24f6e8
1 Parent(s): c0bd00c

update scripts

src/backend/evaluate_model.py CHANGED
@@ -5,7 +5,7 @@ import csv
5
 
6
  import src.envs as envs
7
 
8
- from src.backend.model_operations import SummaryGenerator, EvaluationModel
9
  import src.backend.util as util
10
 
11
  logging.basicConfig(level=logging.INFO,
@@ -26,7 +26,7 @@ class Evaluator:
26
  limit (int): Limit on the number of items to process.
27
  write_out (bool): Whether to write results to a file.
28
  output_base_path (str): Base path for output files.
29
- summary_generator (SummaryGenerator): Instance for generating summaries.
30
  eval_model (EvaluationModel): Instance for evaluating summaries.
31
  """
32
  def __init__(self, model, revision, precision, batch_size,
@@ -56,8 +56,8 @@ class Evaluator:
56
  self.write_out = write_out
57
  self.output_base_path = output_base_path
58
  try:
59
- self.summary_generator = SummaryGenerator(model, revision)
60
- self.eval_model = EvaluationModel(envs.HEM_PATH)
61
  except Exception as e:
62
  logging.error(f"Error initializing Evaluator: {e}")
63
  raise
@@ -81,10 +81,10 @@ class Evaluator:
81
  # print(envs.DATASET_PATH)
82
  # print(df.shape)
83
  # print(df.iloc[-1])
84
- self.generated_summaries_df = self.summary_generator.generate_summaries(envs.DATASET_PATH, df_prompt, save_path=f"./generation_results/{self.model}.csv")
85
  # exit()
86
- # avg_summary_len = self.summary_generator.avg_length
87
- # answer_rate = self.summary_generator.answer_rate
88
  envs.API.upload_file(
89
  path_or_fileobj=f"./generation_results/{self.model}.csv",
90
  path_in_repo=f"{self.model}.csv",
@@ -93,7 +93,7 @@ class Evaluator:
93
  )
94
 
95
  '''Start evaluating the model's results'''
96
- self.humanlike = self.eval_model.evaluate_humanlike(self.generated_summaries_df, envs.HUMAN_DATA, f"./generation_results/{self.model}.csv")
97
 
98
  all_results = self.humanlike
99
  # Prepare individual experiment scores and CIs
@@ -111,23 +111,6 @@ class Evaluator:
111
  overall_ci=all_results['overall']['confidence_interval'],
112
  **experiment_results # Unpack the experiment results
113
  )
114
-
115
- '''原始指标'''
116
-
117
- # self.hallucination_scores, self.eval_results = self.eval_model.evaluate_hallucination(
118
- # self.generated_summaries_df)
119
- # factual_consistency_rate = self.eval_model.compute_factual_consistency_rate()
120
- # hallucination_rate = self.eval_model.hallucination_rate
121
- # factual_consistency_rate = 0
122
- # answer_rate = 0
123
- # avg_summary_len = 0
124
- #
125
- # results = util.format_results(model_name=self.model, revision=self.revision,
126
- # precision=self.precision,
127
- # factual_consistency_rate=factual_consistency_rate,
128
- # hallucination_rate=self.humanlike,
129
- # answer_rate=answer_rate,
130
- # avg_summary_len=avg_summary_len)
131
  return results
132
  except FileNotFoundError:
133
  logging.error(f"File not found: {envs.DATASET_PATH}")
@@ -145,28 +128,28 @@ class Evaluator:
145
  logging.error(f"Need to first download the results from google drive to the learderboard folder")
146
  raise
147
 
148
- source_summary_df = self.generated_summaries_df[["user_prompt", "response"]]
149
 
150
- # #update leaderboard_summaries.csv
151
  # #first remove previous results for the current model
152
- # existing_df = pd.read_csv(os.path.join(working_path, 'leaderboard_summaries.csv'), encoding='utf-8', sep="\t")
153
  # mask = existing_df['model'] == self.model
154
  # existing_df = existing_df[~mask]
155
  # # get new result
156
- leaderboard_summaries_df = source_summary_df
157
- leaderboard_summaries_df.insert(2, "model", [self.model]*leaderboard_summaries_df.shape[0])
158
- leaderboard_summaries_df.to_csv(os.path.join(working_path, 'leaderboard_summaries.csv'), mode='a', index=False, header=False)
159
- print('leaderboard_summaries.csv has been updated')
160
 
161
- # update leaderboard_summaries_with_scores.csv
162
  # BUG: get error when opening the file
163
- # existing_df = pd.read_csv(os.path.join(working_path, 'leaderboard_summaries_with_scores.csv'),
164
  # encoding='utf-8', sep=",", on_bad_lines='warn', quotechar='"', quoting=2)
165
  # print(existing_df.shape)
166
  # mask = existing_df['model'] == self.model
167
  # existing_df = existing_df[~mask]
168
  # get new result
169
- leaderboard_summaries_with_scores_df = pd.DataFrame.from_dict(self.eval_results)
170
- leaderboard_summaries_with_scores_df.insert(3, "model", [self.model]*leaderboard_summaries_with_scores_df.shape[0])
171
- leaderboard_summaries_with_scores_df.to_csv(os.path.join(working_path, 'leaderboard_summaries_with_scores.csv'), mode='a', index=False, header=False)
172
- print('leaderboard_summaries_with_scores.csv has been updated')
 
5
 
6
  import src.envs as envs
7
 
8
+ from src.backend.model_operations import ResponseGenerator, EvaluationModel
9
  import src.backend.util as util
10
 
11
  logging.basicConfig(level=logging.INFO,
 
26
  limit (int): Limit on the number of items to process.
27
  write_out (bool): Whether to write results to a file.
28
  output_base_path (str): Base path for output files.
29
+ response_generator (ResponseGenerator): Instance for generating summaries.
30
  eval_model (EvaluationModel): Instance for evaluating summaries.
31
  """
32
  def __init__(self, model, revision, precision, batch_size,
 
56
  self.write_out = write_out
57
  self.output_base_path = output_base_path
58
  try:
59
+ self.response_generator = ResponseGenerator(model, revision)
60
+ self.eval_model = EvaluationModel()
61
  except Exception as e:
62
  logging.error(f"Error initializing Evaluator: {e}")
63
  raise
 
81
  # print(envs.DATASET_PATH)
82
  # print(df.shape)
83
  # print(df.iloc[-1])
84
+ self.generated_responses_df = self.response_generator.generate_response(envs.DATASET_PATH, df_prompt, save_path=f"./generation_results/{self.model}.csv")
85
  # exit()
86
+ # avg_response_len = self.response_generator.avg_length
87
+ # answer_rate = self.response_generator.answer_rate
88
  envs.API.upload_file(
89
  path_or_fileobj=f"./generation_results/{self.model}.csv",
90
  path_in_repo=f"{self.model}.csv",
 
93
  )
94
 
95
  '''Start evaluating the model's results'''
96
+ self.humanlike = self.eval_model.evaluate_humanlike(self.generated_responses_df, envs.HUMAN_DATA, f"./generation_results/{self.model}.csv")
97
 
98
  all_results = self.humanlike
99
  # Prepare individual experiment scores and CIs
 
111
  overall_ci=all_results['overall']['confidence_interval'],
112
  **experiment_results # Unpack the experiment results
113
  )
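  For orientation, a minimal sketch of how the per-experiment scores and confidence intervals could be flattened into the keyword arguments that util.format_results receives above. The dict shape and the *_score/*_ci key names are assumptions for illustration, not taken from this commit:

  all_results = {                  # hypothetical shape returned by evaluate_humanlike
      "overall": {"score": 0.71, "confidence_interval": (0.68, 0.74)},
      "E1": {"score": 0.65, "confidence_interval": (0.60, 0.70)},
  }
  experiment_results = {}
  for exp_id, res in all_results.items():
      if exp_id == "overall":
          continue
      experiment_results[f"{exp_id}_score"] = res["score"]            # assumed key names
      experiment_results[f"{exp_id}_ci"] = res["confidence_interval"]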
114
  return results
115
  except FileNotFoundError:
116
  logging.error(f"File not found: {envs.DATASET_PATH}")
 
128
  logging.error(f"Need to first download the results from google drive to the learderboard folder")
129
  raise
130
 
131
+ source_response_df = self.generated_responses_df[["user_prompt", "response"]]
132
 
133
+ # #update leaderboard_responses.csv
134
  # #first remove previous results for the current model
135
+ # existing_df = pd.read_csv(os.path.join(working_path, 'leaderboard_responses.csv'), encoding='utf-8', sep="\t")
136
  # mask = existing_df['model'] == self.model
137
  # existing_df = existing_df[~mask]
138
  # # get new result
139
+ leaderboard_responses_df = source_response_df
140
+ leaderboard_responses_df.insert(2, "model", [self.model]*leaderboard_responses_df.shape[0])
141
+ leaderboard_responses_df.to_csv(os.path.join(working_path, 'leaderboard_responses.csv'), mode='a', index=False, header=False)
142
+ print('leaderboard_responses.csv has been updated')
143
 
144
+ # update leaderboard_responses_with_scores.csv
145
  # BUG: get error when opening the file
146
+ # existing_df = pd.read_csv(os.path.join(working_path, 'leaderboard_responses_with_scores.csv'),
147
  # encoding='utf-8', sep=",", on_bad_lines='warn', quotechar='"', quoting=2)
148
  # print(existing_df.shape)
149
  # mask = existing_df['model'] == self.model
150
  # existing_df = existing_df[~mask]
151
  # get new result
152
+ leaderboard_responses_with_scores_df = pd.DataFrame.from_dict(self.eval_results)
153
+ leaderboard_responses_with_scores_df.insert(3, "model", [self.model]*leaderboard_responses_with_scores_df.shape[0])
154
+ leaderboard_responses_with_scores_df.to_csv(os.path.join(working_path, 'leaderboard_responses_with_scores.csv'), mode='a', index=False, header=False)
155
+ print('leaderboard_responses_with_scores.csv has been updated')
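  The commented-out blocks above describe a remove-then-append update that is currently disabled; the commit simply appends new rows for each run. A minimal sketch of that intended flow (illustrative; the helper name upsert_model_rows does not exist in the repository, and sep/encoding should match the existing file):

  import pandas as pd

  def upsert_model_rows(csv_path: str, model_name: str, new_rows: pd.DataFrame) -> None:
      """Drop any previous rows for model_name, then rewrite the file with the new rows appended."""
      existing = pd.read_csv(csv_path)
      existing = existing[existing["model"] != model_name]   # remove stale results for this model
      pd.concat([existing, new_rows], ignore_index=True).to_csv(csv_path, index=False)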
src/backend/model_operations.py CHANGED
@@ -6,7 +6,7 @@ from pathlib import Path
6
  import requests
7
  import json
8
 
9
- import numpy as np
10
  import pandas as pd
11
  import spacy
12
  from sentence_transformers import CrossEncoder
@@ -43,7 +43,7 @@ subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"], check=Tr
43
  # Load spacy model for word tokenization
44
  # nlp = spacy.load("en_core_web_sm")
45
  try:
46
- nlp1 = spacy.load("en_core_web_sm")
47
  except OSError:
48
  print("无法加载模型,继续执行其他处理。")
49
 
@@ -55,22 +55,6 @@ logging.basicConfig(level=logging.INFO,
55
 
56
 
57
 
58
- # os.environ["HUGGINGFACE_API_KEY"] = envs.TOKEN
59
-
60
- def load_evaluation_model(model_path):
61
- """Load the evaluation model from the given path
62
-
63
- Args:
64
- model_path (str): Path to the evaluation model
65
-
66
- Returns:
67
- CrossEncoder: The evaluation model
68
- """
69
- # model = CrossEncoder(model_path)
70
- model = ""
71
- return model
72
-
73
-
74
  class ModelLoadingException(Exception):
75
  """Exception raised for errors in loading a model.
76
 
@@ -85,21 +69,21 @@ class ModelLoadingException(Exception):
85
  super().__init__(f"{messages} id={model_id} revision={revision}")
86
 
87
 
88
- class SummaryGenerator:
89
- """A class to generate summaries using a causal language model.
90
 
91
  Attributes:
92
  model (str): huggingface/{model_id}
93
  api_base (str): https://api-inference.huggingface.co/models/{model_id}
94
- summaries_df (DataFrame): DataFrame to store generated summaries.
95
  revision (str): Model revision.
96
- avg_length (float): Average length of summaries.
97
- answer_rate (float): Rate of non-empty summaries.
98
  """
99
 
100
  def __init__(self, model_id, revision):
101
  """
102
- Initializes the SummaryGenerator with a model.
103
 
104
  Args:
105
  model_id (str): Identifier for the model.
@@ -108,29 +92,28 @@ class SummaryGenerator:
108
  self.model_id = model_id
109
  self.model = f"huggingface/{model_id}"
110
  self.api_base = f"https://api-inference.huggingface.co/models/{model_id}"
111
- self.summaries_df = pd.DataFrame()
112
  self.revision = revision
113
  self.avg_length = None
114
  self.answer_rate = None
115
  self.exceptions = None
116
  self.local_model = None
117
 
118
- def generate_summaries(self, dataset, df_prompt, save_path=None):
119
- """Generate summaries for a given DataFrame of source docs.
120
- Modify this part to pull the model's generated results
121
  Args:
122
  dataset (DataFrame): DataFrame containing source docs.
123
 
124
  Returns:
125
- summaries_df (DataFrame): Generated summaries by the model.
126
  """
127
  exceptions = []
128
  if (save_path is not None) and os.path.exists(save_path):
129
  '''The file already exists, so the existing test outputs can be loaded'''
130
- self.summaries_df = pd.read_csv(save_path)
131
- # print(self.summaries_df['Experiment'])
132
 
133
- print(f'Loaded generated summaries from {save_path}')
134
  else:
135
  '''The test file does not exist, so the specified model needs to be called to run the test'''
136
  # prompt = {}
@@ -193,9 +176,9 @@ class SummaryGenerator:
193
  while True:
194
  try:
195
  '''Call the model'''
196
- print(ID,'-',j,'-',ii)
197
 
198
- _response = self.generate_summary(system_prompt, _user_prompt)
199
  # print(f"Finish index {index}")
200
  break
201
  except Exception as e:
@@ -221,7 +204,7 @@ class SummaryGenerator:
221
  print(f"Error at index {i}: {e}")
222
  time.sleep(wait_time)
223
  try:
224
- _response = self.generate_summary(system_prompt, _user_prompt)
225
  break
226
  except Exception as ee:
227
  exceptions.append(ee)
@@ -236,45 +219,46 @@ class SummaryGenerator:
236
  break
237
  if i == 5:
238
  #print(_response)
239
 
240
  def extract_responses(text, trigger_words=None):
241
  if trigger_words is None:
242
  trigger_words = ["sure", "okay", "yes"]
243
 
244
  try:
 
245
  sentences = text.split('\n')
246
-
247
  sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
248
-
249
  sentences = [sentence.split(':', 1)[-1].strip() if ':' in sentence else sentence for
250
  sentence in sentences]
251
- if any(sentences[0].lower().startswith(word) for word in trigger_words) and len(sentences)>2:
252
  _response1 = sentences[1].strip() if len(sentences) > 1 else None
253
  _response2 = sentences[2].strip() if len(sentences) > 2 else None
254
  else:
255
  _response1 = sentences[0].strip() if len(sentences) > 0 else None
256
  _response2 = sentences[1].strip() if len(sentences) > 1 else None
257
 
 
258
  except Exception as e:
259
  print(f"Error occurred: {e}")
260
  _response1, _response2 = None, None
261
 
 
262
 
263
  return _response1, _response2
264
 
265
  _response1, _response2 = extract_responses(_response)
266
- # if _response == None:
267
- # _response1, _response2 = "", ""
268
- # else:
269
- # try:
270
- # import re
271
- # _response1,_response2 = re.split(r'\n\s*\n', _response.strip())
272
- # except:
273
- # _response1 = _response.split('\n\n')
274
- # if len(_response) == 2:
275
- # _response1, _response2 = _response[0], _response[1]
276
- # else:
277
- # _response1, _response2 = _response[0], ""
278
 
279
  Experiment_ID.append(ID)
280
  Questions_ID.append(q_column[j])
@@ -309,30 +293,26 @@ class SummaryGenerator:
309
  Stimuli_1.append(Stimuli_1_column[j])
310
  Item_ID.append(Item_column[j])
311
  Condition.append(Condition_column[j])
312
- #print(_response)
313
-
314
-
315
- # exit()
316
 
317
  # Sleep to prevent hitting rate limits too frequently
318
  time.sleep(1)
319
 
320
- self.summaries_df = pd.DataFrame(list(zip(Experiment_ID, Questions_ID, Item_ID, Condition, User_prompt, Response, Factor_2, Stimuli_1)),
321
- columns=["Experiment", "Question_ID", "Item", "Condition", "User_prompt", "Response","Factor 2","Stimuli 1"])
322
 
323
  if save_path is not None:
324
- print(f'Save summaries to {save_path}')
325
  fpath = Path(save_path)
326
  fpath.parent.mkdir(parents=True, exist_ok=True)
327
- self.summaries_df.to_csv(fpath)
328
 
329
  self.exceptions = exceptions
330
  # self._compute_avg_length()
331
  # self._compute_answer_rate()
332
 
333
- return self.summaries_df
334
 
335
- def generate_summary(self, system_prompt: str, user_prompt: str):
336
  # Using Together AI API
337
  using_together_api = False
338
  together_ai_api_models = ['mixtral', 'dbrx', 'wizardlm']
@@ -354,17 +334,9 @@ class SummaryGenerator:
354
  "model": self.model_id,
355
  # "max_tokens": 4096,
356
  'max_new_tokens': 100,
357
- # "temperature": 0.0,
358
  # 'repetition_penalty': 1.1 if 'mixtral' in self.model_id.lower() else 1
359
  }
360
- # if 'mixtral' in self.model_id.lower():
361
- # # payload['prompt'] = user_prompt
362
- # # payload['prompt'] = "Write a summary of the following passage:\nPassage:\n" + user_prompt.split('Passage:\n')[-1] + '\n\nSummary:'
363
- # payload['prompt'] = 'You must stick to the passage provided. Provide a concise summary of the following passage, covering the core pieces of information described:\nPassage:\n' + user_prompt.split('Passage:\n')[-1] + '\n\nSummary:'
364
- # print(payload)
365
- # else:
366
- # payload['messages'] = [{"role": "system", "content": system_prompt},
367
- # {"role": "user", "content": user_prompt}]
368
  payload['messages'] = [{"role": "system", "content": system_prompt},
369
  {"role": "user", "content": user_prompt}]
370
  headers = {
@@ -462,82 +434,13 @@ class SummaryGenerator:
462
  continue
463
 
464
  raise Exception("All tokens failed.")
465
- # print(self.model_id)
466
- # print(self.api_base)
467
- # mistralai/Mistral-7B-Instruct-v0.1
468
- # https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.1
469
- # Using HF API or download checkpoints
470
- # try: # try use HuggingFace API
471
- # from huggingface_hub import InferenceClient
472
- # print("token_for_request:",envs.TOKEN)
473
- # print(self.model_id)
474
- # client = InferenceClient(self.model_id,api_key=envs.TOKEN,headers={"X-use-cache": "false"})
475
- # messages = [{"role": "system", "content": system_prompt},{"role": "user", "content": user_prompt}]
476
- # # outputs = client.chat_completion(messages, max_tokens=100)
477
- # result = None
478
- # while result is None:
479
- # outputs = client.chat_completion(messages, max_tokens=100)
480
- # result = outputs['choices'][0]['message']['content']
481
- #
482
- # if result is None:
483
- # time.sleep(1) # Optional: Add a small delay before retrying
484
- #
485
- # return result
486
- #
487
- # except Exception as e:
488
- # print(f"Error with TOKEN: {envs.TOKEN}, trying with TOKEN1")
489
- # try:
490
- # client = InferenceClient(self.model_id, api_key=envs.TOKEN1, headers={"X-use-cache": "false"})
491
- # messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
492
- # result = None
493
- # while result is None:
494
- # outputs = client.chat_completion(messages, max_tokens=100)
495
- # result = outputs['choices'][0]['message']['content']
496
- #
497
- # if result is None:
498
- # time.sleep(1) # Optional: Add a small delay before retrying
499
- #
500
- # return result
501
- # except Exception as ee:
502
- # print(f"Error with TOKEN1: {envs.TOKEN1}")
503
- # raise ee
504
-
505
-
506
- # except: # fail to call api. run it locally.
507
- # self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, trust_remote_code=True)
508
- # print("Tokenizer loaded")
509
- # self.local_model = AutoModelForCausalLM.from_pretrained(self.model_id, trust_remote_code=True, device_map="auto", torch_dtype="auto", cache_dir='/home/paperspace/cache')
510
- # print("Local model loaded")
511
- # response = litellm.completion(
512
- # model="huggingface/"+'command-r-plus' if 'command' in self.model_id else self.model_id,
513
- # messages=[{"role": "system", "content": system_prompt},
514
- # {"role": "user", "content": user_prompt}],
515
- # temperature=0.0,
516
- # max_tokens=1024,
517
- # api_base= "https://api-inference.huggingface.co/models/" + self.model_id,
518
- # )
519
- # self.model_id = 'command-r-plus' if 'command' in self.model_id else self.model_id
520
- # response = litellm.completion(
521
- # model="huggingface/" + self.model_id,
522
- # # mistralai/Mistral-7B-Instruct-v0.1",
523
- # messages=[{"role": "system", "content": system_prompt},
524
- # {"role": "user", "content": user_prompt}],
525
- # #temperature=0.0,
526
- # max_tokens=1024,
527
- # api_base="https://api-inference.huggingface.co/models/" + self.model_id)
528
- # print("模型返回结果",response)
529
- # print("模型返回结果结束")
530
- # # exit()
531
- # result = response['choices'][0]['message']['content']
532
- # print(result)
533
- # exit()
534
- # Using Google AI API for Gemini models
535
  elif 'gemini' in self.model_id.lower():
536
  genai.configure(api_key=os.getenv('GOOGLE_AI_API_KEY'))
537
  generation_config = {
538
- "temperature": 0,
539
- "top_p": 0.95, # cannot change
540
- "top_k": 0,
541
  "max_output_tokens": 100,
542
  # "response_mime_type": "application/json",
543
  }
@@ -589,58 +492,28 @@ class SummaryGenerator:
589
  # Using local model
590
 
591
 
592
- def _compute_avg_length(self):
593
- """
594
- Compute the average length of non-empty summaries using SpaCy.
595
- """
596
- total_word_count = 0
597
- total_count = 0
598
-
599
- for summary in self.summaries_df['summary']:
600
- if util.is_summary_valid(summary):
601
- doc = nlp1(summary)
602
- words = [token.text for token in doc if token.is_alpha]
603
- total_word_count += len(words)
604
- total_count += 1
605
-
606
- self.avg_length = 0 if total_count == 0 else total_word_count / total_count
607
-
608
- def _compute_answer_rate(self):
609
- """
610
- Compute the rate of non-empty summaries.
611
- """
612
- valid_count = sum(1 for summary in self.summaries_df['summary']
613
- if util.is_summary_valid(summary))
614
-
615
- total_count = len(self.summaries_df)
616
-
617
- self.answer_rate = 0 if total_count == 0 else valid_count / total_count
618
 
619
 
620
  class EvaluationModel:
621
- """A class to evaluate generated summaries.
622
 
623
  Attributes:
624
  model (CrossEncoder): The evaluation model.
625
- scores (list): List of evaluation scores.
626
- accuracy (float): Accuracy of the summaries.
627
- hallucination_rate (float): Rate of hallucination in summaries.
 
628
  """
629
 
630
  def __init__(self, model_path):
631
  """
632
- Initializes the EvaluationModel with a CrossEncoder model.
633
-
634
- Args:
635
- model_path (str): Path to the CrossEncoder model.
636
  """
637
- self.model = load_evaluation_model(model_path)
638
  self.scores = []
639
- self.factual_consistency_rate = None
640
- self.hallucination_rate = None
641
  self.humanlike_score = None
642
 
643
- def code_results(self, summaries_df):
644
  '''code results from LLM's response'''
645
  output = []
646
  '''database for Exp4'''
@@ -661,28 +534,27 @@ class EvaluationModel:
661
  Stimuli1[item5['ID'][j]] = item5['Stimuli-1'][j]
662
  Stimuli2[item5['ID'][j]] = item5['Stimuli-2'][j]
663
 
664
-
665
  male_keyword = ["he", "his", "himself"]
666
  female_keyword = ["she", "her", "herself"]
667
- print(len(summaries_df["Experiment"]))
668
- for i in range(len(summaries_df["Experiment"])):
 
669
  # vote_1_1, vote_1_2, vote_1_3 = 0, 0, 0
670
  # print()
671
- if pd.isna(summaries_df["Response"][i]):
672
  output.append("Other")
673
  continue
674
- rs = summaries_df["Response"][i].strip().lower()
675
- sentences = rs.split('\n')
676
- sentences = [sentence.split(':', 1)[-1].strip() if ':' in sentence else sentence
677
- for sentence in sentences]
678
- rs = [sentence.strip() for sentence in sentences if sentence.strip()]
679
- rs = '\n'.join(rs)
680
- rs = rs.replace("[", '').replace("]", '')
 
681
  '''Exp1'''
682
- # period and comma will affect the result
683
- if summaries_df["Experiment"][i] == "E1":
684
- print("E1", rs)
685
- rs = rs.replace('"','')
686
  if rs == "round":
687
  # vote_1_1 += 1
688
  output.append("Round")
@@ -691,13 +563,12 @@ class EvaluationModel:
691
  else:
692
  output.append("Other")
693
 
694
-
695
  '''Exp2'''
696
- # not the first pronoun
697
- elif summaries_df["Experiment"][i] == "E2":
698
- # rs = summaries_df["Response"][i].strip()
699
  rs = rs.split(' ')
700
- print("E2", rs)
701
  male, female = 0, 0
702
  for word in rs:
703
  if word in female_keyword and male == 0:
@@ -708,323 +579,63 @@ class EvaluationModel:
708
  male = 1
709
  output.append("Male")
710
  break
711
- if male == 0 and female == 0 :
712
  output.append("Other")
713
 
714
  '''Exp3'''
715
- #
716
- elif summaries_df["Experiment"][i] == "E3":
717
- # rs = summaries_df["Response"][i].strip()
718
- print("E3", rs)
719
- if pd.isna(summaries_df["Factor 2"][i]):
720
- output.append("Other")
721
- else:
722
- if summaries_df["Factor 2"][i].strip() == "LS":
723
- if "2" in rs:
724
- output.append("Long")
725
- elif "3" in rs:
726
- output.append("Short")
727
- else:
728
- output.append("Other")
729
- if summaries_df["Factor 2"][i].strip() == "SL":
730
- if "2" in rs:
731
- output.append("Short")
732
- elif "3" in rs:
733
- output.append("Long")
734
- else:
735
- output.append("Other")
736
- '''Exp4'''
737
-
738
- elif summaries_df["Experiment"][i] == "E4":
739
- # rs = summaries_df["Response"][i].strip()
740
- target = summaries_df["Factor 2"][i].strip().lower()
741
- pair = target + "_" + rs
742
- print("E4:", pair)
743
- if pair in wordpair2code.keys():
744
- output.append(wordpair2code[pair])
745
- else:
746
- output.append("Other")
747
-
748
- '''Exp5'''
749
- elif summaries_df["Experiment"][i] == "E5" or summaries_df["Experiment"][i] == "E51":
750
- # sentence = summaries_df["Response"][i].strip()
751
- item_id = summaries_df["Item"][i]
752
- question_id = summaries_df["Question_ID"][i]
753
-
754
- sti1, sti2 = "", ""
755
-
756
- if summaries_df["Experiment"][i] == "E51":
757
- sti1 = Stimuli1[question_id[0:-1]].lower().replace("...", "")
758
- sti2 = Stimuli2[question_id[0:-1]].lower().replace("...", "")
759
- verb = item2verb1[item_id].lower()
760
 
761
- sentence = sti1 + " " + rs.replace(sti1, "")
762
- print("E5", verb, sentence)
763
- if summaries_df["Experiment"][i] == "E5":
764
- sti1 = Stimuli1[question_id].lower().replace("...", "")
765
- # print(sti1)
766
- sti2 = Stimuli2[question_id].lower().replace("...", "")
767
-
768
- verb = item2verb2[item_id].lower()
769
- sentence = sti2.replace("...","") + " " + rs.replace(sti2, "")
770
- print("E5", verb, sentence)
771
-
772
-
773
- doc = nlp1(sentence.replace(" "," "))
774
- # print(doc)
775
- # print()
776
- verb_token = None
777
- for token in doc:
778
- # print(token.lemma_)
779
- if token.lemma_ == verb:
780
- verb_token = token
781
- break
782
- # exit()
783
- if verb_token is None:
784
  output.append("Other")
785
- print("E5 The target verb is missing from the sentence.")
786
  else:
787
- pobj, dative = None, None
788
- # print(verb_token.children)
789
- # exit()
790
- for child in verb_token.children:
791
- print(child)
792
- if (child.dep_ == 'dative' and child.pos_ == "ADP") or (child.text == "to" and child.dep_ == 'prep' and child.pos_ == "ADP"):
793
- pobj = child.text
794
- if child.dep_ == 'dative':
795
- dative = child.text
796
- print("E5", pobj, dative)
797
- # exit()
798
-
799
- if pobj:
800
- output.append("PO")
801
- elif dative:
802
- output.append("DO")
803
- else:
804
- print("Other", sentence, pobj, dative)
805
- # exit()
806
  output.append("Other")
807
-
808
- '''Exp6'''
809
-
810
- elif summaries_df["Experiment"][i] == "E6":
811
- sentence = summaries_df["Stimuli 1"][i].strip().lower()
812
- print("E6", sentence)
813
- doc = nlp1(sentence)
814
- subject = "None"
815
- obj = "None"
816
- # Walk the dependency relations to find the subject and object
817
- for token in doc:
818
- if token.dep_ == "nsubj":
819
- subject = token.text
820
- elif token.dep_ == "dobj":
821
- obj = token.text
822
- print("E6", subject, obj)
823
- if subject in rs and obj in rs:
824
- print(rs, subject, obj, "Other")
825
- output.append("Other")
826
- elif subject in rs:
827
- print(rs, subject, obj, "VP")
828
- output.append("VP")
829
- elif obj in rs:
830
- print(rs, subject, obj, "NP")
831
- output.append("NP")
832
- else:
833
- print(rs, subject, obj, "Other")
834
- output.append("Other")
835
-
836
-
837
-
838
-
839
- '''Exp7'''
840
- elif summaries_df["Experiment"][i] == "E7":
841
- # rs = summaries_df["Response"][i].strip().lower()
842
- print("E7",rs)
843
- if rs == "no":
844
- output.append("0")
845
- elif rs == "yes":
846
- output.append("1")
847
- else:
848
- output.append("Other")
849
-
850
- '''Exp8'''
851
- elif summaries_df["Experiment"][i] == "E8":
852
- # rs = summaries_df["Response"][i].strip()
853
-
854
- if "something is wrong with the question" in rs:
855
- output.append("1")
856
- else:
857
- output.append("0")
858
-
859
- '''Exp9'''
860
- elif summaries_df["Experiment"][i] == "E9":
861
- male, female = 0, 0
862
-
863
- # rs = summaries_df["Response"][i].strip()
864
- if "because" in rs:
865
- rs = rs.replace("because because","because").split("because")[1]
866
- else:
867
- rs = rs
868
- condition = summaries_df["Factor 2"][i].strip()
869
- rs = rs.split(" ")
870
- for w in rs:
871
- if w in male_keyword and female != 1:
872
- male = 1
873
- break
874
- if w in female_keyword and male != 1:
875
- female = 1
876
- break
877
- print("E9", "condition", condition, "male", male, "female", female)
878
- if male == 0 and female == 0:
879
- output.append('Other')
880
- else:
881
- if male == 1 and female==0:
882
- if condition == "MF":
883
- output.append("Subject")
884
- elif condition == "FM":
885
- output.append("Object")
886
- else:
887
- output.append("Other")
888
- elif female == 1 and male ==0:
889
- if condition == "MF":
890
- output.append("Object")
891
- elif condition == "FM":
892
- output.append("Subject")
893
  else:
894
  output.append("Other")
895
 
896
- '''Exp10'''
897
- elif summaries_df["Experiment"][i] == "E10":
898
- # rs = summaries_df["Response"][i].strip()
899
- if rs == "yes":
900
- output.append("1")
901
- else:
902
- output.append("0")
903
- else:
904
- print("can;t find the Exp:", summaries_df["Experiment"][i])
905
- output.append("NA")
906
- # print(output)
907
- # exit()
908
- '''human'''
909
- self.data = pd.DataFrame(list(zip(summaries_df["Experiment"], summaries_df["Question_ID"], summaries_df["Item"], summaries_df["Response"], summaries_df["Factor 2"], summaries_df["Stimuli 1"], summaries_df["Coding"], output)),
910
- columns=["Experiment", "Question_ID", "Item", "Response", "Factor 2", "Simulate 1","Original_Coding","Coding"])
911
- # '''LLM'''
912
- # self.data = pd.DataFrame(list(zip(summaries_df["Experiment"], summaries_df["Question_ID"], summaries_df["Item"], summaries_df["Response"], summaries_df["Factor 2"], summaries_df["Stimuli 1"], output)),
913
- # columns=["Experiment", "Question_ID", "Item", "Response", "Factor 2", "Simulate 1","Coding"])
914
- print(self.data.head())
915
-
916
- return self.data
917
- def code_results_llm(self, summaries_df):
918
- '''code results from LLM's response'''
919
- output = []
920
- '''database for Exp4'''
921
- item4 = pd.read_csv(envs.ITEM_4_DATA)
922
- wordpair2code = {}
923
- for j in range(len(item4['Coding'])):
924
- wordpair2code[item4['Pair'][j]] = item4['Coding'][j]
925
- '''verb for Exp5'''
926
- item5 = pd.read_csv(envs.ITEM_5_DATA)
927
- # item corresponding to verb, same item id corresponding to verb pair
928
- item2verb2 = {}
929
- item2verb1 = {}
930
-
931
- Stimuli1, Stimuli2 = {}, {}
932
- for j in range(len(item5['Item'])):
933
- item2verb1[item5['Item'][j]] = item5['Verb1'][j]
934
- item2verb2[item5['Item'][j]] = item5['Verb2'][j]
935
- Stimuli1[item5['ID'][j]] = item5['Stimuli-1'][j]
936
- Stimuli2[item5['ID'][j]] = item5['Stimuli-2'][j]
937
-
938
-
939
- male_keyword = ["he", "his", "himself"]
940
- female_keyword = ["she", "her", "herself"]
941
- print(len(summaries_df["Experiment"]))
942
- for i in range(len(summaries_df["Experiment"])):
943
- # vote_1_1, vote_1_2, vote_1_3 = 0, 0, 0
944
- # print()
945
- # data cleaning
946
- if pd.isna(summaries_df["Response"][i]):
947
- output.append("Other")
948
- continue
949
- rs = summaries_df["Response"][i].strip().lower()
950
- sentences = rs.split('\n')
951
- sentences = [sentence.split(':', 1)[-1].strip() if ':' in sentence else sentence
952
- for sentence in sentences]
953
- rs = [sentence.strip() for sentence in sentences if sentence.strip()]
954
- rs = '\n'.join(rs)
955
- rs = rs.replace('[', '').replace(']','').replace('.','')
956
- '''Exp1'''
957
- # the period and comma will affect the result
958
- if summaries_df["Experiment"][i] == "E1":
959
- print("E1", rs)
960
- rs = rs.replace('"', '') # Remove any unnecessary quotation marks
961
- rs_cleaned = rs.replace(',', '') # Remove periods and commas
962
-
963
- # Use 'contains' instead of 'equals' for keyword matching to avoid issues caused by punctuation
964
- if "round" in rs_cleaned:
965
- output.append("Round")
966
- elif "spiky" in rs_cleaned:
967
- output.append("Spiky")
968
- else:
969
- output.append("Other")
970
-
971
-
972
- '''Exp2'''
973
-
974
- elif summaries_df["Experiment"][i] == "E2":
975
- rs = rs.split(' ')
976
- print("E2", rs)
977
- male, female = 0, 0
978
- for word in rs:
979
- if word in female_keyword and male == 0:
980
- female = 1
981
- output.append("Female")
982
- break
983
- if word in male_keyword and female == 0:
984
- male = 1
985
- output.append("Male")
986
- break
987
- if male == 0 and female == 0 :
988
- output.append("Other")
989
-
990
- '''Exp3'''
991
- elif summaries_df["Experiment"][i] == "E3":
992
- # rs = summaries_df["Response"][i].strip()
993
- print("E3", rs)
994
- rs = rs.replace('"', '').lower().replace(".","")
995
- pair = summaries_df["Factor 2"][i]
996
- word1, word2 = pair.split('_')
997
-
998
- if rs == word1:
999
- if len(word1) > len(word2):
1000
- output.append("Long")
1001
- else:
1002
- output.append("Short")
1003
- elif rs == word2:
1004
- if len(word1) > len(word2):
1005
- output.append("Short")
1006
- else:
1007
- output.append("Long")
1008
- else:
1009
- output.append("Other")
1010
-
1011
  '''Exp4'''
1012
 
1013
- elif summaries_df["Experiment"][i] == "E4":
1014
  try:
1015
  meaning_word = rs.split(";")[4].replace(" ", '')
1016
  except IndexError:
1017
- output.append("Other")
1018
- continue
1019
  except Exception as e:
1020
  print(f"Unexpected error: {e}")
1021
  output.append("Other")
1022
  continue
1023
- meaning_word = meaning_word.replace('.', '')
1024
- meaning_word = meaning_word.replace(';', '')
1025
- target = summaries_df["Factor 2"][i].strip().lower()
1026
  pair = target + "_" + meaning_word
1027
- print("E4:", pair)
1028
 
1029
  if pair in wordpair2code.keys():
1030
  output.append(wordpair2code[pair])
@@ -1032,31 +643,30 @@ class EvaluationModel:
1032
  output.append("Other")
1033
 
1034
  '''Exp5'''
1035
- elif summaries_df["Experiment"][i] == "E5" or summaries_df["Experiment"][i] == "E51":
1036
- # sentence = summaries_df["Response"][i].strip()
1037
- item_id = summaries_df["Item"][i]
1038
- question_id = summaries_df["Question_ID"][i]
1039
 
1040
  sti1, sti2 = "", ""
1041
 
1042
- if summaries_df["Experiment"][i] == "E51":
1043
  sti1 = Stimuli1[question_id[0:-1]].lower().replace("...", "")
1044
- sti2 = Stimuli2[question_id[0:-1]].lower().replace("...", "")
1045
  verb = item2verb1[item_id].lower()
1046
 
1047
  sentence = sti1 + " " + rs.replace(sti1, "")
1048
- print("E5", verb, sentence)
1049
- if summaries_df["Experiment"][i] == "E5":
1050
- sti1 = Stimuli1[question_id].lower().replace("...", "")
1051
- # print(sti1)
1052
  sti2 = Stimuli2[question_id].lower().replace("...", "")
1053
 
1054
  verb = item2verb2[item_id].lower()
1055
- sentence = sti2.replace("...","") + " " + rs.replace(sti2, "")
1056
- print("E5", verb, sentence)
1057
 
1058
-
1059
- doc = nlp1(sentence.replace(" "," "))
1060
  # print(doc)
1061
  # print()
1062
  verb_token = None
@@ -1066,102 +676,94 @@ class EvaluationModel:
1066
  verb_token = token
1067
  break
1068
  # exit()
1069
- if verb_token is None:
1070
- output.append("Other")
1071
- print("E5 The target verb is missing from the sentence.")
1072
- else:
1073
- pobj, dative = None, None
1074
- # print(verb_token.children)
1075
- # exit()
1076
  for child in verb_token.children:
1077
- print(child)
1078
- if (child.dep_ == 'dative' and child.pos_ == "ADP") or (child.text == "to" and child.dep_ == 'prep' and child.pos_ == "ADP"):
 
1079
  pobj = child.text
1080
  if child.dep_ == 'dative':
1081
  dative = child.text
1082
- print("E5", pobj, dative)
1083
  # exit()
 
 
1084
 
1085
- if pobj:
1086
- output.append("PO")
1087
- elif dative:
1088
- output.append("DO")
1089
- else:
1090
- print("Other", sentence, pobj, dative)
1091
- # exit()
1092
- output.append("Other")
1093
 
1094
  '''Exp6'''
1095
 
1096
- elif summaries_df["Experiment"][i] == "E6":
1097
- sentence = summaries_df["Stimuli 1"][i].strip().lower()
1098
- print("E6", sentence)
1099
  doc = nlp1(sentence)
1100
  subject = "None"
1101
  obj = "None"
 
 
1102
  for token in doc:
1103
  if token.dep_ == "nsubj":
1104
  subject = token.text
1105
  elif token.dep_ == "dobj":
1106
  obj = token.text
1107
- print("E6", subject, obj)
1108
  if subject in rs and obj in rs:
1109
- print(rs, subject, obj, "Other")
1110
  output.append("Other")
1111
  elif subject in rs:
1112
- print(rs, subject, obj, "VP")
1113
  output.append("VP")
1114
  elif obj in rs:
1115
- print(rs, subject, obj, "NP")
1116
  output.append("NP")
1117
  else:
1118
- print(rs, subject, obj, "Other")
1119
  output.append("Other")
1120
 
1121
-
1122
-
1123
-
1124
  '''Exp7'''
1125
- elif summaries_df["Experiment"][i] == "E7":
1126
- # Remove periods and commas, then convert to lowercase
1127
  rs = rs.replace(".", "").replace(",", "").lower()
1128
- print("E7", rs)
1129
-
1130
- # Split the response into words
1131
- words = rs.split(' ')
1132
- found = False
1133
-
1134
- for word in words:
1135
- if word == "no":
1136
- output.append("0")
1137
- found = True
1138
- break
1139
- elif word == "yes":
1140
- output.append("1")
1141
- found = True
1142
- break
1143
- if not found:
1144
  output.append("Other")
1145
 
1146
  '''Exp8'''
1147
- elif summaries_df["Experiment"][i] == "E8":
1148
- # rs = summaries_df["Response"][i].strip()
1149
- print("E8",rs)
1150
  if "something is wrong with the question" in rs:
1151
  output.append("1")
1152
  else:
1153
  output.append("0")
1154
 
1155
  '''Exp9'''
1156
- elif summaries_df["Experiment"][i] == "E9":
1157
  male, female = 0, 0
1158
 
1159
- # rs = summaries_df["Response"][i].strip()
1160
  if "because" in rs:
1161
- rs = rs.replace("because because","because").split("because")[1]
1162
  else:
1163
  rs = rs
1164
- condition = summaries_df["Factor 2"][i].strip()
1165
  rs = rs.split(" ")
1166
  for w in rs:
1167
  if w in male_keyword and female != 1:
@@ -1170,18 +772,18 @@ class EvaluationModel:
1170
  if w in female_keyword and male != 1:
1171
  female = 1
1172
  break
1173
- print("E9", "condition", condition, "male", male, "female", female)
1174
- if male == 0 and female == 0:
1175
  output.append('Other')
1176
  else:
1177
- if male == 1 and female==0:
1178
  if condition == "MF":
1179
  output.append("Subject")
1180
  elif condition == "FM":
1181
  output.append("Object")
1182
  else:
1183
  output.append("Other")
1184
- elif female == 1 and male ==0:
1185
  if condition == "MF":
1186
  output.append("Object")
1187
  elif condition == "FM":
@@ -1190,28 +792,28 @@ class EvaluationModel:
1190
  output.append("Other")
1191
 
1192
  '''Exp10'''
1193
- elif summaries_df["Experiment"][i] == "E10":
1194
- # Remove periods from the response
1195
- rs = rs.replace(".", "").lower() # Convert to lowercase to ensure case-insensitivity
1196
- print("E10", rs)
1197
-
1198
- # Check if the response contains "yes"
1199
- if "yes" in rs:
1200
  output.append("1")
1201
  else:
1202
  output.append("0")
1203
  else:
1204
- print("cant find the Exp:", summaries_df["Experiment"][i])
1205
  output.append("NA")
1206
  # print(output)
1207
  # exit()
1208
  '''human'''
1209
- # self.data = pd.DataFrame(list(zip(summaries_df["Experiment"], summaries_df["Question_ID"], summaries_df["Item"], summaries_df["Response"], summaries_df["Factor 2"], summaries_df["Stimuli 1"], summaries_df["Coding"], output)),
1210
  # columns=["Experiment", "Question_ID", "Item", "Response", "Factor 2", "Simulate 1","Original_Coding","Coding"])
1211
  '''LLM'''
1212
- self.data = pd.DataFrame(list(zip(summaries_df["Experiment"], summaries_df["Question_ID"], summaries_df["Item"], summaries_df["Response"], summaries_df["Factor 2"], summaries_df["Stimuli 1"], output)),
1213
- columns=["Experiment", "Question_ID", "Item", "Response", "Factor 2", "Simulate 1","Coding"])
1214
- print(self.data.head())
1215
 
1216
  return self.data
1217
 
@@ -1332,55 +934,8 @@ class EvaluationModel:
1332
 
1333
  return all_results
1334
 
1335
- # ### Calculate Average JS Divergence ###
1336
- #
1337
- # # Extract the relevant columns for JS divergence calculation
1338
- # human_responses = human_df[['Question_ID', 'Coding']]
1339
- # llm_responses = llm_df[['Question_ID', 'Coding']]
1340
- #
1341
- # # Get unique Question_IDs present in both datasets
1342
- # common_question_ids = set(human_responses['Question_ID']).intersection(set(llm_responses['Question_ID']))
1343
- #
1344
- # # Initialize a list to store JS divergence for each Question_ID
1345
- # js_divergence_list = []
1346
- # js_divergence ={}
1347
- #
1348
- # # Calculate JS divergence for each common Question_ID
1349
- # for q_id in common_question_ids:
1350
- # # Get response distributions for the current Question_ID in both datasets
1351
- # human_dist = human_responses[human_responses['Question_ID'] == q_id]['Coding'].value_counts(normalize=True)
1352
- # llm_dist = llm_responses[llm_responses['Question_ID'] == q_id]['Coding'].value_counts(normalize=True)
1353
- #
1354
- # # Reindex the distributions to have the same index, filling missing values with 0
1355
- # all_responses = set(human_dist.index).union(set(llm_dist.index))
1356
- # human_dist = human_dist.reindex(all_responses, fill_value=0)
1357
- # llm_dist = llm_dist.reindex(all_responses, fill_value=0)
1358
- #
1359
- # # Calculate JS divergence and add to the list
1360
- # js_div = jensenshannon(human_dist, llm_dist, base=2)
1361
- # experiment_id = q_id.split('_')[1]
1362
- # if experiment_id not in js_divergence:
1363
- # js_divergence[experiment_id] = []
1364
- # js_divergence[experiment_id].append(js_div)
1365
- #
1366
- # js_divergence_list.append(js_div)
1367
- # #js_divergence[q_id] = js_div
1368
- #
1369
- #
1370
- #
1371
- # # Calculate the average JS divergence
1372
- # # JS per experiment
1373
- # avg_js_divergence_per_experiment = {exp: 1- np.nanmean(divs) for exp, divs in js_divergence.items()}
1374
- # print(avg_js_divergence_per_experiment)
1375
- #
1376
- # # JS overall
1377
- # avg_js_divergence = 1 - np.nanmean(js_divergence_list)
1378
- # print("avg_js_divergence:", avg_js_divergence)
1379
- #
1380
- # return avg_js_divergence
1381
-
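  The disabled block above outlines the Jensen-Shannon-divergence variant of the human-likeness score. For reference, it amounts to the following self-contained sketch (illustrative only; the helper name js_similarity is not part of the repository):

  import numpy as np
  import pandas as pd
  from scipy.spatial.distance import jensenshannon

  def js_similarity(human_df: pd.DataFrame, llm_df: pd.DataFrame) -> float:
      """1 - mean JS divergence between human and LLM coding distributions per Question_ID."""
      common_ids = set(human_df["Question_ID"]) & set(llm_df["Question_ID"])
      divergences = []
      for q_id in common_ids:
          h = human_df.loc[human_df["Question_ID"] == q_id, "Coding"].value_counts(normalize=True)
          m = llm_df.loc[llm_df["Question_ID"] == q_id, "Coding"].value_counts(normalize=True)
          labels = sorted(set(h.index) | set(m.index))
          h = h.reindex(labels, fill_value=0)        # align the two distributions
          m = m.reindex(labels, fill_value=0)
          divergences.append(jensenshannon(h, m, base=2))
      return 1 - float(np.nanmean(divergences))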
1382
-
1383
- def evaluate_humanlike(self, summaries_df: object, human_data_path: object, result_save_path: object) -> object:
1384
  '''
1385
  evaluate humanlike score
1386
  1. code the result
@@ -1401,7 +956,7 @@ class EvaluationModel:
1401
 
1402
  '''coding llm data'''
1403
  save_path = result_save_path.replace('.csv','_coding.csv')
1404
- self.llm_df = self.code_results_llm(summaries_df)
1405
 
1406
 
1407
 
@@ -1412,7 +967,7 @@ class EvaluationModel:
1412
  self.llm_df.to_csv(fpath)
1413
 
1414
  envs.API.upload_file(
1415
- path_or_fileobj= save_path,#./generation_results/meta-llama/Llama-2-13b-chat-hf_coding.csv
1416
  path_in_repo=f"{save_path.replace('generation_results/','')}",#
1417
  repo_id=envs.RESULTS_REPO,
1418
  repo_type="dataset",
@@ -1426,111 +981,3 @@ class EvaluationModel:
1426
 
1427
 
1428
 
1429
-
1430
-
1431
-
1432
-
1433
-
1434
-
1435
-
1436
-
1437
-
1438
-
1439
-
1440
-
1441
-
1442
-
1443
-
1444
- def evaluate_hallucination(self, summaries_df):
1445
- """
1446
- Evaluate the hallucination rate in summaries. Updates the 'scores' attribute
1447
- of the instance with the computed scores.
1448
-
1449
- Args:
1450
- summaries_df (DataFrame): DataFrame containing source docs and summaries.
1451
-
1452
- Returns:
1453
- list: List of hallucination scores. Also updates the 'scores' attribute of the instance.
1454
- """
1455
- hem_scores = []
1456
- sources = []
1457
- summaries = []
1458
- source_summary_pairs = util.create_pairs(summaries_df)
1459
- '''Evaluate the model's results'''
1460
- for doc, summary in tqdm(source_summary_pairs, desc="Evaluating Humanlikeness"):
1461
- if util.is_summary_valid(summary):
1462
- try:
1463
- summary = summary.replace('<bos>','').replace('<eos>','')
1464
- score = self.model.predict([doc, summary])# [0]
1465
- if not isinstance(score, float):
1466
- try:
1467
- score = score.item()
1468
- except:
1469
- logging.warning(f"Score type mismatch: Expected float, got {type(score)}.")
1470
- continue
1471
- hem_scores.append(score)
1472
- sources.append(doc)
1473
- summaries.append(summary)
1474
- except Exception as e:
1475
- logging.error(f"Error while running HEM: {e}")
1476
- raise
1477
-
1478
- self.scores = hem_scores
1479
- eval_results = {'source': sources, 'summary': summaries, 'HEM scores': hem_scores}
1480
- return hem_scores, eval_results
1481
- # for doc, summary in tqdm(source_summary_pairs, desc="Evaluating hallucinations"):
1482
- # if util.is_summary_valid(summary):
1483
- # try:
1484
- # # summary_pieces = summary.split('\n')
1485
- # # summary = summary_pieces[0] if len(summary_pieces[0].strip()) > 0 else summary_pieces[1]
1486
- # summary = summary.replace('<bos>','').replace('<eos>','')
1487
- # # print([doc, summary])
1488
- # # print(self.model.predict([doc, summary]))
1489
- # score = self.model.predict([doc, summary])# [0]
1490
- # if not isinstance(score, float):
1491
- # try:
1492
- # score = score.item()
1493
- # except:
1494
- # logging.warning(f"Score type mismatch: Expected float, got {type(score)}.")
1495
- # continue
1496
- # hem_scores.append(score)
1497
- # sources.append(doc)
1498
- # summaries.append(summary)
1499
- # except Exception as e:
1500
- # logging.error(f"Error while running HEM: {e}")
1501
- # raise
1502
-
1503
- # self.scores = hem_scores
1504
- # eval_results = {'source': sources, 'summary': summaries, 'HEM scores': hem_scores}
1505
- # return hem_scores, eval_results
1506
-
1507
-
1508
- def compute_factual_consistency_rate(self, threshold=0.5):
1509
- """
1510
- Compute the factual consistency rate of the evaluated summaries based on
1511
- the previously calculated scores. This method relies on the 'scores'
1512
- attribute being populated, typically via the 'evaluate_hallucination' method.
1513
-
1514
- Returns:
1515
- float: Factual Consistency Rate. Also updates the 'factual_consistency_rate'
1516
- and 'hallucination_rate' attributes of the instance.
1517
-
1518
- Raises:
1519
- ValueError: If scores have not been calculated prior to calling this method.
1520
- """
1521
- if not self.scores:
1522
- error_msg = "Scores not calculated. Call evaluate_hallucination() first."
1523
- logging.error(error_msg)
1524
- raise ValueError(error_msg)
1525
-
1526
- # Use threshold of 0.5 to compute factual_consistency_rate
1527
- num_above_threshold = sum(score >= threshold for score in self.scores)
1528
- num_total = len(self.scores)
1529
-
1530
- if not num_total:
1531
- raise ValueError("No scores available to compute factual consistency rate.")
1532
-
1533
- self.factual_consistency_rate = (num_above_threshold / num_total) * 100
1534
- self.hallucination_rate = 100 - self.factual_consistency_rate
1535
-
1536
- return self.factual_consistency_rate
 
6
  import requests
7
  import json
8
 
9
+ # import numpy as np
10
  import pandas as pd
11
  import spacy
12
  from sentence_transformers import CrossEncoder
 
43
  # Load spacy model for word tokenization
44
  # nlp = spacy.load("en_core_web_sm")
45
  try:
46
+ nlp1 = spacy.load("en_core_web_trf")
47
  except OSError:
48
  print("无法加载模型,继续执行其他处理。")
49
 
 
55
 
56
 
57
 
58
  class ModelLoadingException(Exception):
59
  """Exception raised for errors in loading a model.
60
 
 
69
  super().__init__(f"{messages} id={model_id} revision={revision}")
70
 
71
 
72
+ class ResponseGenerator:
73
+ """A class to generate responses using a causal language model.
74
 
75
  Attributes:
76
  model (str): huggingface/{model_id}
77
  api_base (str): https://api-inference.huggingface.co/models/{model_id}
78
+ responses_df (DataFrame): DataFrame to store generated responses.
79
  revision (str): Model revision.
80
+ avg_length (float): Average length of responses.
81
+ answer_rate (float): Rate of non-empty responses.
82
  """
83
 
84
  def __init__(self, model_id, revision):
85
  """
86
+ Initializes the ResponseGenerator with a model.
87
 
88
  Args:
89
  model_id (str): Identifier for the model.
 
92
  self.model_id = model_id
93
  self.model = f"huggingface/{model_id}"
94
  self.api_base = f"https://api-inference.huggingface.co/models/{model_id}"
95
+ self.responses_df = pd.DataFrame()
96
  self.revision = revision
97
  self.avg_length = None
98
  self.answer_rate = None
99
  self.exceptions = None
100
  self.local_model = None
101
 
102
+ def generate_response(self, dataset, df_prompt, save_path=None):
103
+ """Generate responses for a given DataFrame of source docs.
 
104
  Args:
105
  dataset (DataFrame): DataFrame containing source docs.
106
 
107
  Returns:
108
+ responses_df (DataFrame): Generated responses by the model.
109
  """
110
  exceptions = []
111
  if (save_path is not None) and os.path.exists(save_path):
112
  '''The file already exists, so the existing test outputs can be loaded'''
113
+ self.responses_df = pd.read_csv(save_path)
114
+ # print(self.responses_df['Experiment'])
115
 
116
+ print(f'Loaded generated responses from {save_path}')
117
  else:
118
  '''The test file does not exist, so the specified model needs to be called to run the test'''
119
  # prompt = {}
 
176
  while True:
177
  try:
178
  '''Call the model'''
179
+ print(self.model_id.lower(),'-',ID,'-',j,'-',ii)
180
 
181
+ _response = self.send_request(system_prompt, _user_prompt)
182
  # print(f"Finish index {index}")
183
  break
184
  except Exception as e:
 
204
  print(f"Error at index {i}: {e}")
205
  time.sleep(wait_time)
206
  try:
207
+ _response = self.send_request(system_prompt, _user_prompt)
208
  break
209
  except Exception as ee:
210
  exceptions.append(ee)
 
219
  break
220
  if i == 5:
221
  #print(_response)
222
+ # For E5, the responses might be in the following formats:
223
+ # "Sure\n\nThe first sentence of the response\n\nThe second sentence of the response"
224
+ # "The first sentence of the response\n\nThe second sentence of the response"
225
+ # "XXX: The first sentence of the response\n\nXXX: The second sentence of the response"
226
+ # "Sure\n\nXXX: The first sentence of the response\n\nXXX: The second sentence of the response"
227
+ # "Sure\n\nThe first sentence of the response\n\nThe second sentence of the response\n\n"
228
 
229
  def extract_responses(text, trigger_words=None):
230
  if trigger_words is None:
231
  trigger_words = ["sure", "okay", "yes"]
232
 
233
  try:
234
+ # Split the text into sentences
235
  sentences = text.split('\n')
236
+ # Remove empty sentences
237
  sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
238
+ # Remove the first sentence if it has a : in it,
239
  sentences = [sentence.split(':', 1)[-1].strip() if ':' in sentence else sentence for
240
  sentence in sentences]
241
+ # Remove empty sentences
242
+ sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
243
+ # Remove the first sentence if it is a trigger word
244
+ if any(sentences[0].lower().startswith(word) for word in trigger_words) and len(
245
+ sentences) > 2:
246
  _response1 = sentences[1].strip() if len(sentences) > 1 else None
247
  _response2 = sentences[2].strip() if len(sentences) > 2 else None
248
  else:
249
  _response1 = sentences[0].strip() if len(sentences) > 0 else None
250
  _response2 = sentences[1].strip() if len(sentences) > 1 else None
251
 
252
+
253
  except Exception as e:
254
  print(f"Error occurred: {e}")
255
  _response1, _response2 = None, None
256
 
257
+ print(_response1), print(_response2)
258
 
259
  return _response1, _response2
260
 
261
  _response1, _response2 = extract_responses(_response)
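  As an illustration of the heuristic above (the example text is hypothetical, not from the dataset): a reply that starts with a trigger word or carries "Sentence N:" prefixes is reduced to the two target sentences.

  sample = ("Sure, here are the sentences:\n"
            "Sentence 1: The nurse gave the book to the patient.\n"
            "Sentence 2: The nurse gave the patient the book.")
  first, second = extract_responses(sample)
  # first  -> "The nurse gave the book to the patient."
  # second -> "The nurse gave the patient the book."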
262
 
263
  Experiment_ID.append(ID)
264
  Questions_ID.append(q_column[j])
 
293
  Stimuli_1.append(Stimuli_1_column[j])
294
  Item_ID.append(Item_column[j])
295
  Condition.append(Condition_column[j])
296
 
297
  # Sleep to prevent hitting rate limits too frequently
298
  time.sleep(1)
299
 
300
+ self.responses_df = pd.DataFrame(list(zip(Experiment_ID, Questions_ID, Item_ID, Condition, User_prompt, Response, Factor_2, Stimuli_1)),
301
+ columns=["Experiment", "Question_ID", "Item", "Condition", "User_prompt", "Response","Factor 2","Stimuli 1"])
302
 
303
  if save_path is not None:
304
+ print(f'Save responses to {save_path}')
305
  fpath = Path(save_path)
306
  fpath.parent.mkdir(parents=True, exist_ok=True)
307
+ self.responses_df.to_csv(fpath)
308
 
309
  self.exceptions = exceptions
310
  # self._compute_avg_length()
311
  # self._compute_answer_rate()
312
 
313
+ return self.responses_df
314
 
315
+ def send_request(self, system_prompt: str, user_prompt: str):
316
  # Using Together AI API
317
  using_together_api = False
318
  together_ai_api_models = ['mixtral', 'dbrx', 'wizardlm']
 
334
  "model": self.model_id,
335
  # "max_tokens": 4096,
336
  'max_new_tokens': 100,
337
+ # "a": 0.0,
338
  # 'repetition_penalty': 1.1 if 'mixtral' in self.model_id.lower() else 1
339
  }
340
  payload['messages'] = [{"role": "system", "content": system_prompt},
341
  {"role": "user", "content": user_prompt}]
342
  headers = {
 
434
  continue
435
 
436
  raise Exception("All tokens failed.")
437
438
  elif 'gemini' in self.model_id.lower():
439
  genai.configure(api_key=os.getenv('GOOGLE_AI_API_KEY'))
440
  generation_config = {
441
+ # "temperature": 0,
442
+ # "top_p": 0.95, # cannot change
443
+ # "top_k": 0,
444
  "max_output_tokens": 100,
445
  # "response_mime_type": "application/json",
446
  }
 
492
  # Using local model
493
 
494
 
495
 
496
 
497
  class EvaluationModel:
498
+ """A class to evaluate generated responses.
499
 
500
  Attributes:
501
  model (CrossEncoder): The evaluation model.
502
+ scores (list): List of scores for the responses.
503
+ humanlike_score (float): Human-likeness score
504
+
505
+
506
  """
507
 
508
  def __init__(self, model_path):
509
  """
510
+ Initializes the EvaluationModel.
511
  """
512
+ # self.model = load_evaluation_model(model_path)
513
  self.scores = []
 
 
514
  self.humanlike_score = None
515
 
516
+ def code_results_llm(self, responses_df):
517
  '''code results from LLM's response'''
518
  output = []
519
  '''database for Exp4'''
 
534
  Stimuli1[item5['ID'][j]] = item5['Stimuli-1'][j]
535
  Stimuli2[item5['ID'][j]] = item5['Stimuli-2'][j]
536
 
 
537
  male_keyword = ["he", "his", "himself"]
538
  female_keyword = ["she", "her", "herself"]
539
+ #print(len(responses_df["Experiment"]))
540
+ for i in range(len(responses_df["Experiment"])):
541
+ print(i, "/", len(responses_df["Experiment"]))
542
  # vote_1_1, vote_1_2, vote_1_3 = 0, 0, 0
543
  # print()
544
+ if pd.isna(responses_df["Response"][i]):
545
  output.append("Other")
546
  continue
547
+ rs = responses_df["Response"][i].strip().lower()
548
+ rs = rs.replace('"', '').replace(" ", " ").replace('.', '')
549
+ lines = rs.split("\n")
550
+ filtered_lines = [line for line in lines if line and not (line.endswith(":") or line.endswith(":"))]
551
+ filtered_lines = [r.split(':', 1)[-1].strip() if ':' in r else r for
552
+ r in filtered_lines]
553
+ rs = "\n".join(filtered_lines)
554
+ rs = rs.strip()
555
  '''Exp1'''
556
+ if responses_df["Experiment"][i] == "E1":
557
+ #print("E1", rs)
 
 
558
  if rs == "round":
559
  # vote_1_1 += 1
560
  output.append("Round")
 
563
  else:
564
  output.append("Other")
565
 
 
566
  '''Exp2'''
567
+
568
+ elif responses_df["Experiment"][i] == "E2":
569
+ # rs = responses_df["Response"][i].strip()
570
  rs = rs.split(' ')
571
+ #print("E2", rs)
572
  male, female = 0, 0
573
  for word in rs:
574
  if word in female_keyword and male == 0:
 
579
  male = 1
580
  output.append("Male")
581
  break
582
+ if male == 0 and female == 0:
583
  output.append("Other")
584
 
585
  '''Exp3'''
586
+ elif responses_df["Experiment"][i] == "E3":
587
+ # rs = responses_df["Response"][i].strip()
588
+ #print("E3", rs)
589
+ pair = responses_df["Factor 2"][i]
590
+ word1, word2 = pair.replace(".", "").split('_')
591
 
592
+ if responses_df["Item"][i] == 12:
593
  output.append("Other")
 
594
  else:
595
+ words = rs.split() # split the response into words
596
+ output = []
597
+ if any(word == word1 for word in words) and any(word == word2 for word in words):
598
  output.append("Other")
599
+ else:
600
+ if any(word.lower() == word1.lower() for word in words):
601
+ if len(word1) > len(word2):
602
+ output.append("Long")
603
+ else:
604
+ output.append("Short")
605
+ elif any(word.lower() == word2.lower() for word in words):
606
+ if len(word1) > len(word2):
607
+ output.append("Short")
608
+ else:
609
+ output.append("Long")
610
  else:
611
  output.append("Other")
612
 
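  A small worked instance of the E3 coding rule above, using a hypothetical stimulus pair (not taken from the item files):

  pair = "marmalade_jam"                      # hypothetical Factor 2 value: word1_word2
  word1, word2 = pair.split("_")
  words = "i would call it marmalade".split()
  if any(w == word1 for w in words) and any(w == word2 for w in words):
      label = "Other"                         # both forms mentioned
  elif any(w.lower() == word1.lower() for w in words):
      label = "Long" if len(word1) > len(word2) else "Short"   # -> "Long" here
  elif any(w.lower() == word2.lower() for w in words):
      label = "Short" if len(word1) > len(word2) else "Long"
  else:
      label = "Other"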
613
  '''Exp4'''
614
 
615
+ elif responses_df["Experiment"][i] == "E4":
616
+ filtered_lines = [r.split(':', 1)[-1].strip() if ':' in r else r for r in rs.split("\n")]
617
+ filtered_lines = [r.split('-', 1)[-1].strip() if '-' in r else r for r in filtered_lines]
618
+ rs = "\n".join(filtered_lines)
619
+
620
+ filtered_lines = [r.split(':', 1)[-1].strip() if ':' in r else r for r in rs.split(";")]
621
+ filtered_lines = [r.split('-', 1)[-1].strip() if '-' in r else r for r in filtered_lines]
622
+ rs = ";".join(filtered_lines).strip()
623
  try:
624
  meaning_word = rs.split(";")[4].replace(" ", '')
625
  except IndexError:
626
+ try:
627
+ meaning_word = rs.split("\n")[4].replace(" ", '')
628
+ except IndexError:
629
+ output.append("Other")
630
+ continue
631
  except Exception as e:
632
  print(f"Unexpected error: {e}")
633
  output.append("Other")
634
  continue
635
+
636
+ target = responses_df["Factor 2"][i].strip().lower()
 
637
  pair = target + "_" + meaning_word
638
+ #print("E4:", pair)
639
 
640
  if pair in wordpair2code.keys():
641
  output.append(wordpair2code[pair])
 
643
  output.append("Other")
644
 
645
  '''Exp5'''
646
+ elif responses_df["Experiment"][i] == "E5" or responses_df["Experiment"][i] == "E51":
647
+ # sentence = responses_df["Response"][i].strip()
648
+ item_id = responses_df["Item"][i]
649
+ question_id = responses_df["Question_ID"][i]
650
 
651
  sti1, sti2 = "", ""
652
 
653
+ if responses_df["Experiment"][i] == "E51":
654
  sti1 = Stimuli1[question_id[0:-1]].lower().replace("...", "")
655
+ #sti2 = Stimuli2[question_id[0:-1]].lower().replace("...", "")
656
  verb = item2verb1[item_id].lower()
657
 
658
  sentence = sti1 + " " + rs.replace(sti1, "")
659
+ #print("E5", verb, sentence)
660
+ if responses_df["Experiment"][i] == "E5":
661
+ #sti1 = Stimuli1[question_id].lower().replace("...", "")
662
+ # print(sti1)
663
  sti2 = Stimuli2[question_id].lower().replace("...", "")
664
 
665
  verb = item2verb2[item_id].lower()
666
+ sentence = sti2 + " " + rs.replace(sti2, "")
667
+ #print("E5", verb, sentence)
668
 
669
+ doc = nlp1(sentence.replace("  ", " "))  # collapse doubled spaces before parsing
 
670
  # print(doc)
671
  # print()
672
  verb_token = None
 
676
  verb_token = token
677
  break
678
  # exit()
679
+ pobj, dative = None, None
680
+ # print(verb_token.children)
681
+ # exit()
682
+ if verb_token is not None:
 
 
 
683
  for child in verb_token.children:
684
+ # print(child)
685
+ if (child.dep_ == 'dative' and child.pos_ == "ADP") or (
686
+ child.text == "to" and child.dep_ == 'prep' and child.pos_ == "ADP"):
687
  pobj = child.text
688
  if child.dep_ == 'dative':
689
  dative = child.text
690
+
691
+ # print("E5", pobj, dative)
692
+ # exit()
693
+
694
+ if pobj:
695
+ output.append("PO")
696
+ elif dative:
697
+ output.append("DO")
698
+ else:
699
+ # print("Other", sentence, pobj, dative)
700
  # exit()
701
+ output.append("Other")
702
+
703
704
 
705
  '''Exp6'''
706
 
707
+ elif responses_df["Experiment"][i] == "E6":
708
+ sentence = responses_df["Stimuli 1"][i].strip().lower()
709
+ #print("E6", sentence)
710
  doc = nlp1(sentence)
711
  subject = "None"
712
  obj = "None"
713
+
714
+
715
  for token in doc:
716
  if token.dep_ == "nsubj":
717
  subject = token.text
718
  elif token.dep_ == "dobj":
719
  obj = token.text
720
+ #print("E6", subject, obj)
721
  if subject in rs and obj in rs:
722
+ #print(rs, subject, obj, "Other")
723
  output.append("Other")
724
  elif subject in rs:
725
+ #print(rs, subject, obj, "VP")
726
  output.append("VP")
727
  elif obj in rs:
728
+ #print(rs, subject, obj, "NP")
729
  output.append("NP")
730
  else:
731
+ #print(rs, subject, obj, "Other")
732
  output.append("Other")
733
 
 
 
 
734
  '''Exp7'''
735
+ elif responses_df["Experiment"][i] == "E7":
736
+ # rs = responses_df["Response"][i].strip().lower()
737
  rs = rs.replace(".", "").replace(",", "").lower()
738
+ #print("E7", rs)
739
+ if "yes" in rs and "no" in rs:
740
+ output.append("Other")
741
+ elif "no" in rs:
742
+ output.append("0")
743
+ elif "yes" in rs:
744
+ output.append("1")
745
+ else:
746
  output.append("Other")
747
 
748
  '''Exp8'''
749
+ elif responses_df["Experiment"][i] == "E8":
750
+ # rs = responses_df["Response"][i].strip()
751
+ #print("E8", rs)
752
  if "something is wrong with the question" in rs:
753
  output.append("1")
754
  else:
755
  output.append("0")
756
 
757
  '''Exp9'''
758
+ elif responses_df["Experiment"][i] == "E9":
759
  male, female = 0, 0
760
 
761
+ # rs = responses_df["Response"][i].strip()
762
  if "because" in rs:
763
+ rs = rs.replace("because because", "because").split("because")[1]
764
  else:
765
  rs = rs
766
+ condition = responses_df["Factor 2"][i].strip()
767
  rs = rs.split(" ")
768
  for w in rs:
769
  if w in male_keyword and female != 1:
 
772
  if w in female_keyword and male != 1:
773
  female = 1
774
  break
775
+ #print("E9", "condition", condition, "male", male, "female", female)
776
+ if male == 0 and female == 0:
777
  output.append('Other')
778
  else:
779
+ if male == 1 and female == 0:
780
  if condition == "MF":
781
  output.append("Subject")
782
  elif condition == "FM":
783
  output.append("Object")
784
  else:
785
  output.append("Other")
786
+ elif female == 1 and male == 0:
787
  if condition == "MF":
788
  output.append("Object")
789
  elif condition == "FM":
 
792
  output.append("Other")
793
 
794
  '''Exp10'''
795
+ elif responses_df["Experiment"][i] == "E10":
796
+ # rs = responses_df["Response"][i].strip()
797
+ rs = rs.replace(".", "")
798
+ if rs == "yes":
 
 
 
799
  output.append("1")
800
  else:
801
  output.append("0")
802
  else:
803
+ #print("can't find the Exp:", responses_df["Experiment"][i])
804
  output.append("NA")
805
  # print(output)
806
  # exit()
807
  '''human'''
808
+ # self.data = pd.DataFrame(list(zip(responses_df["Experiment"], responses_df["Question_ID"], responses_df["Item"], responses_df["Response"], responses_df["Factor 2"], responses_df["Stimuli 1"], responses_df["Coding"], output)),
809
  # columns=["Experiment", "Question_ID", "Item", "Response", "Factor 2", "Simulate 1","Original_Coding","Coding"])
810
  '''LLM'''
811
+ # print(len(output))
812
+ self.data = pd.DataFrame(list(
813
+ zip(responses_df["Experiment"], responses_df["Question_ID"], responses_df["Item"], responses_df["Response"],
814
+ responses_df["Factor 2"], responses_df["Stimuli 1"], output)),
815
+ columns=["Experiment", "Question_ID", "Item", "Response", "Factor 2", "Stimuli 1",
816
+ "Coding"])
817
 
818
  return self.data
819
 
 
934
 
935
  return all_results
936
 
937
+
938
+ def evaluate_humanlike(self, responses_df: pd.DataFrame, human_data_path: object, result_save_path: str) -> object:
939
  '''
940
  evaluate humanlike score
941
  1. code the result
 
956
 
957
  '''coding llm data'''
958
  save_path = result_save_path.replace('.csv','_coding.csv')
959
+ self.llm_df = self.code_results_llm(responses_df)
960
 
961
 
962
 
 
967
  self.llm_df.to_csv(fpath)
968
 
969
  envs.API.upload_file(
970
+ path_or_fileobj=save_path,#./generation_results/meta-llama/Llama-2-13b-chat-hf_coding.csv
971
  path_in_repo=f"{save_path.replace('generation_results/','')}",#
972
  repo_id=envs.RESULTS_REPO,
973
  repo_type="dataset",
 
981
 
982
 
983
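Similarly, the E2 and E9 branches of the coding step above reduce each response to the first gendered pronoun it contains. A standalone sketch of that scheme, with hypothetical helper names rather than repository code, might look like this:

```python
# Simplified stand-in for the E2/E9 pronoun coding in code_results_llm;
# the keyword lists mirror the ones defined in that function.
MALE_KEYWORDS = {"he", "his", "himself"}
FEMALE_KEYWORDS = {"she", "her", "herself"}

def code_pronoun_gender(response: str) -> str:
    """Return 'Male', 'Female', or 'Other' based on the first gendered pronoun."""
    for word in response.lower().replace(".", " ").replace(",", " ").split():
        if word in MALE_KEYWORDS:
            return "Male"
        if word in FEMALE_KEYWORDS:
            return "Female"
    return "Other"

print(code_pronoun_gender("She said that he was late."))    # -> Female (first match wins)
print(code_pronoun_gender("The engineer fixed the pipe."))  # -> Other
```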
src/backend/util.py CHANGED
@@ -1,81 +1,3 @@
1
- def is_summary_valid(summary: str) -> bool:
2
- """
3
- Checks if the summary is valid.
4
-
5
- A summary is valid if it is not empty and contains at least five words.
6
-
7
- Args:
8
- summary (str): The summary to check.
9
-
10
- Returns:
11
- bool: True if the summary is valid, False otherwise.
12
- """
13
- if isinstance(summary, str):
14
- words = summary.split()
15
- if len(words) >= 5:
16
- return True
17
- # print(summary)
18
- return False
19
-
20
-
21
- def create_pairs(df):
22
- """
23
- Creates pairs of source and summary from the dataframe.
24
-
25
- Args:
26
- df (DataFrame): The dataframe containing source and summary columns.
27
-
28
- Returns:
29
- list: A list of pairs [source, summary].
30
- """
31
- pairs = []
32
- for _, row in df.iterrows():
33
- pairs.append([row['source'], row['summary']])
34
-
35
- return pairs
36
-
37
-
38
- # def format_results(model_name: str, revision: str, precision: str,
39
- # factual_consistency_rate: float, hallucination_rate: float,
40
- # answer_rate: float, avg_summary_len: float) -> dict:
41
- # """
42
- # Formats the evaluation results into a structured dictionary.
43
- #
44
- # Args:
45
- # model_name (str): The name of the evaluated model.
46
- # revision (str): The revision hash of the model.
47
- # precision (str): The precision with which the evaluation was run.
48
- # factual_consistency_rate (float): The factual consistency rate.
49
- # hallucination_rate (float): The hallucination rate.
50
- # answer_rate (float): The answer rate.
51
- # avg_summary_len (float): The average summary length.
52
- #
53
- # Returns:
54
- # dict: A dictionary containing the structured evaluation results.
55
- # """
56
- # results = {
57
- # "config": {
58
- # "model_dtype": precision, # Precision with which you ran the evaluation
59
- # "model_name": model_name, # Name of the model
60
- # "model_sha": revision # Hash of the model
61
- # },
62
- # "results": {
63
- # "hallucination_rate": {
64
- # "hallucination_rate": round(hallucination_rate,3)
65
- # },
66
- # "factual_consistency_rate": {
67
- # "factual_consistency_rate": round(factual_consistency_rate,1)
68
- # },
69
- # "answer_rate": {
70
- # "answer_rate": round(answer_rate*100,1)
71
- # },
72
- # "average_summary_length": {
73
- # "average_summary_length": round(avg_summary_len,1)
74
- # },
75
- # }
76
- # }
77
- #
78
- # return results
79
 
80
  def format_results(model_name: str, revision: str, precision: str, overall_js: float, overall_ci: tuple, **experiment_scores) -> dict:
81
  """
@@ -97,7 +19,7 @@ def format_results(model_name: str, revision: str, precision: str, overall_js: f
97
  "config": {
98
  "model_dtype": precision, # Precision with which you ran the evaluation
99
  "model_name": model_name, # Name of the model
100
- "model_sha": revision # Hash of the model
101
  },
102
  "results": {
103
  "overall_js_divergence": overall_js, # Overall JS divergence
1
 
2
  def format_results(model_name: str, revision: str, precision: str, overall_js: float, overall_ci: tuple, **experiment_scores) -> dict:
3
  """
 
19
  "config": {
20
  "model_dtype": precision, # Precision with which you ran the evaluation
21
  "model_name": model_name, # Name of the model
22
+ #"model_sha": revision # Hash of the model
23
  },
24
  "results": {
25
  "overall_js_divergence": overall_js, # Overall JS divergence
src/display/about.py CHANGED
@@ -33,10 +33,6 @@ class Tasks(Enum):
33
  E10 = Task("E10", "E10", "E10 Humanlike %")
34
  E10_ci = Task("E10_ci", "E10_ci", "E10 CI")
35
 
36
- # factual_consistency_rate = Task("factual_consistency_rate", "factual_consistency_rate", "Factual Consistency Rate (%)")
37
- # answer_rate = Task("answer_rate", "answer_rate", "Answer Rate (%)")
38
- # average_summary_length = Task("average_summary_length",
39
- # "average_summary_length", "Average Summary Length")
40
 
41
 
42
  # Your leaderboard name
@@ -59,18 +55,6 @@ To quantify the similarity, we collected responses from 2000 human participants,
59
  To measure the similarity between human and model responses, we utilize the Jensen-Shannon (JS) divergence. This method allows us to compare the two binomial distributions (one from human responses and one from model responses) for each stimulus.
60
  The similarity is quantified by calculating 1 minus the JS divergence, where a value closer to 1 indicates higher similarity.
61
 
62
- ## Evaluation Dataset
63
-
64
- Our evaluation dataset consists of 1006 documents from multiple public datasets, primarily [CNN/Daily Mail Corpus](https://huggingface.co/datasets/cnn_dailymail/viewer/1.0.0/test).
65
- We generate summaries for each of these documents using submitted LLMs and compute hallucination scores for each pair of document and generated summary. (Check the prompt we used [here](https://huggingface.co/spaces/vectara/Hallucination-evaluation-leaderboard))
66
-
67
- ## Metrics Explained
68
- - Individual Task Similarity: For each psycholinguistic task, we calculate the humanlike score for each stimulus, providing a measure of how closely the model’s responses resemble those of humans.
69
- - Average Similarity: The average of the humanlike scores across all stimuli and tasks, giving an overall indication of the model’s performance in mimicking human language use.
70
-
71
- ## Note on non-Hugging Face models
72
- On HHEM leaderboard, There are currently models such as GPT variants that are not available on the Hugging Face model hub. We ran the evaluations for these models on our own and uploaded the results to the leaderboard.
73
- If you would like to submit your model that is not available on the Hugging Face model hub, please contact us at minseok@vectara.com.
74
 
75
  ## Model Submissions and Reproducibility
76
You can submit your model for evaluation whether or not it is hosted on the Hugging Face model hub (hosting it on the Hugging Face Hub is recommended, though).
@@ -101,27 +85,26 @@ After the evaluation, results are saved in "eval-results-bk/your_model_id/result
101
  ## Results Format
102
  The results are structured in JSON as follows:
103
  ```python
104
- {
105
- "config": {
106
- "model_dtype": "float16",
107
- "model_name": "your_model_id",
108
- "model_sha": "main"
109
- },
110
- "results": {
111
- "hallucination_rate": {
112
- "hallucination_rate": ...
113
- },
114
- "factual_consistency_rate": {
115
- "factual_consistency_rate": ...
116
- },
117
- "answer_rate": {
118
- "answer_rate": ...
119
- },
120
- "average_summary_length": {
121
- "average_summary_length": ...
 
122
  }
123
- }
124
- }
125
  ```
126
  For additional queries or model submissions, please contact xufeng.duan@link.cuhk.edu.hk.
127
  """
 
33
  E10 = Task("E10", "E10", "E10 Humanlike %")
34
  E10_ci = Task("E10_ci", "E10_ci", "E10 CI")
35
 
 
 
 
 
36
 
37
 
38
  # Your leaderboard name
 
55
  To measure the similarity between human and model responses, we utilize the Jensen-Shannon (JS) divergence. This method allows us to compare the two binomial distributions (one from human responses and one from model responses) for each stimulus.
56
  The similarity is quantified by calculating 1 minus the JS divergence, where a value closer to 1 indicates higher similarity.
57
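As a concrete illustration, a simplified sketch of the per-stimulus similarity (not the leaderboard's actual scoring code, and with invented response counts) is:

```python
# Simplified per-stimulus similarity: 1 minus the Jensen-Shannon divergence
# between the human and model response distributions (counts are invented).
import numpy as np

def js_divergence(p, q, base=2):
    p, q = np.asarray(p, dtype=float), np.asarray(q, dtype=float)
    p, q = p / p.sum(), q / q.sum()
    m = 0.5 * (p + q)

    def kl(a, b):
        mask = a > 0
        return float(np.sum(a[mask] * np.log(a[mask] / b[mask]) / np.log(base)))

    return 0.5 * kl(p, m) + 0.5 * kl(q, m)

human_counts = [1300, 700]  # how often humans chose each of two response options
model_counts = [55, 45]     # how often the model chose the same options across runs
print(1 - js_divergence(human_counts, model_counts))
```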
 
58
 
59
  ## Model Submissions and Reproducibility
60
You can submit your model for evaluation whether or not it is hosted on the Hugging Face model hub (hosting it on the Hugging Face Hub is recommended, though).
 
85
  ## Results Format
86
  The results are structured in JSON as follows:
87
  ```python
88
+ {
89
+ "config": {
90
+ "model_dtype": "BF16",
91
+ "model_name": "mistralai/Mistral-7B-Instruct-v0.1",
92
+ "model_sha": ""
93
+ },
94
+ "results": {
95
+ "overall_js_divergence": 0.6129438385008659,
96
+ "overall_confidence_interval": [
97
+ 0.5937234777290732,
98
+ 0.6317188731175192
99
+ ],
100
+ "E9": 0.7768461816966632,
101
+ "E9_ci": [
102
+ 0.7474754730701578,
103
+ 0.8058680968641126
104
+ ],
105
+ ...
106
+ }
107
  }
 
 
108
  ```
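A saved result file in this format can be inspected with a few lines of Python; the path below is a placeholder rather than the backend's actual output location:

```python
# Load a saved result file and print the headline numbers (placeholder path).
import json

with open("path/to/your_model_results.json", "r") as f:
    result = json.load(f)

print(result["results"]["overall_js_divergence"])
print(result["results"]["overall_confidence_interval"])
```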
109
  For additional queries or model submissions, please contact xufeng.duan@link.cuhk.edu.hk.
110
  """
src/display/utils.py CHANGED
@@ -45,7 +45,7 @@ auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub Licen
45
  auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
46
  auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
47
  auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
48
- auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
49
  # Dummy column for the search bar (hidden by the custom CSS)
50
  auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
51
 
 
45
  auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
46
  auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
47
  auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
48
+ # auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
49
  # Dummy column for the search bar (hidden by the custom CSS)
50
  auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
51
 
src/envs.py CHANGED
@@ -53,7 +53,6 @@ API = HfApi(token=TOKEN)
53
 
54
  DATASET_PATH = "./src/datasets/Material_Llama2_0603.xlsx" #experiment data
55
  PROMPT_PATH = "./src/datasets/prompt.xlsx" #prompt for each experiment
56
- HEM_PATH = 'vectara/hallucination_evaluation_model'
57
  HUMAN_DATA = "./src/datasets/human_data_coding.csv" #experiment data
58
  ITEM_4_DATA = "./src/datasets/associataion_dataset.csv" #database
59
  ITEM_5_DATA = "./src/datasets/Items_5.csv" #experiment 5 need verb words
@@ -61,5 +60,4 @@ ITEM_5_DATA = "./src/datasets/Items_5.csv" #experiment 5 need verb words
61
  # SYSTEM_PROMPT = "You are a chat bot answering questions using data. You must stick to the answers provided solely by the text in the passage provided."
62
  SYSTEM_PROMPT = "You are participating in a psycholinguistic experiment. You will complete a task on English language use. Please respond to the questions directly, without using introductory phrases (e.g., Sure or OK) or special formats at the beginning of your responses."
63
  '''prompt'''
64
- # USER_PROMPT = "You are asked the question 'Provide a concise summary of the following passage, covering the core pieces of information described': "
65
  USER_PROMPT = ""
 
53
 
54
  DATASET_PATH = "./src/datasets/Material_Llama2_0603.xlsx" #experiment data
55
  PROMPT_PATH = "./src/datasets/prompt.xlsx" #prompt for each experiment
 
56
  HUMAN_DATA = "./src/datasets/human_data_coding.csv" #experiment data
57
  ITEM_4_DATA = "./src/datasets/associataion_dataset.csv" #database
58
  ITEM_5_DATA = "./src/datasets/Items_5.csv" #experiment 5 need verb words
 
60
  # SYSTEM_PROMPT = "You are a chat bot answering questions using data. You must stick to the answers provided solely by the text in the passage provided."
61
  SYSTEM_PROMPT = "You are participating in a psycholinguistic experiment. You will complete a task on English language use. Please respond to the questions directly, without using introductory phrases (e.g., Sure or OK) or special formats at the beginning of your responses."
62
  '''prompt'''
 
63
  USER_PROMPT = ""
src/leaderboard/read_evals.py CHANGED
@@ -85,16 +85,15 @@ class EvalResult:
85
  if isinstance(v, (int, float)) and not math.isnan(v):
86
  accs.append(np.around(v * 100, decimals=0))
87
  elif isinstance(v, list):
88
- # Process each numeric value in the list
89
  accs.extend([np.around(x * 100, decimals=0) for x in v if
90
  isinstance(x, (int, float)) and not math.isnan(x)])
91
  else:
92
# Skip NaN or otherwise invalid values
93
  accs.append(None)
94
 
95
- # Drop None values (if any)
96
  accs = np.array([x for x in accs if x is not None])
97
- # Filter out None values so that accs only contains valid numbers
98
  accs = accs[accs != None]
99
 
100
  results[task.benchmark] = accs
@@ -168,7 +167,7 @@ class EvalResult:
168
  utils.AutoEvalColumn.architecture.name: self.architecture,
169
  utils.AutoEvalColumn.model.name: formatting.make_clickable_model(self.full_model),
170
  utils.AutoEvalColumn.dummy.name: self.full_model,
171
- utils.AutoEvalColumn.revision.name: self.revision,
172
  utils.AutoEvalColumn.license.name: self.license,
173
  utils.AutoEvalColumn.likes.name: self.likes,
174
  utils.AutoEvalColumn.params.name: self.num_params,
 
85
  if isinstance(v, (int, float)) and not math.isnan(v):
86
  accs.append(np.around(v * 100, decimals=0))
87
  elif isinstance(v, list):
 
88
  accs.extend([np.around(x * 100, decimals=0) for x in v if
89
  isinstance(x, (int, float)) and not math.isnan(x)])
90
  else:
91
  # 跳过 NaN 或不符合条件的值
92
  accs.append(None)
93
 
94
+
95
  accs = np.array([x for x in accs if x is not None])
96
+
97
  accs = accs[accs != None]
98
 
99
  results[task.benchmark] = accs
 
167
  utils.AutoEvalColumn.architecture.name: self.architecture,
168
  utils.AutoEvalColumn.model.name: formatting.make_clickable_model(self.full_model),
169
  utils.AutoEvalColumn.dummy.name: self.full_model,
170
+ # utils.AutoEvalColumn.revision.name: self.revision,
171
  utils.AutoEvalColumn.license.name: self.license,
172
  utils.AutoEvalColumn.likes.name: self.likes,
173
  utils.AutoEvalColumn.params.name: self.num_params,
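For reference, the score-flattening loop shown above in read_evals.py boils down to the following standalone sketch (hypothetical helper, same logic):

```python
# Standalone version of the accs-building step: scalars and lists of scores are
# flattened into one array of rounded percentages, skipping NaN values.
import math
import numpy as np

def flatten_scores(values):
    accs = []
    for v in values:
        if isinstance(v, (int, float)) and not math.isnan(v):
            accs.append(np.around(v * 100, decimals=0))
        elif isinstance(v, list):
            accs.extend(np.around(x * 100, decimals=0) for x in v
                        if isinstance(x, (int, float)) and not math.isnan(x))
    return np.array(accs)

print(flatten_scores([0.613, [0.59, 0.63], float("nan")]))  # -> [61. 59. 63.]
```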
src/populate.py CHANGED
@@ -19,7 +19,6 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
19
  print("all results:",df.to_string())
20
  # exit()
21
  try:
22
- df = df.sort_values(by=[utils.AutoEvalColumn.hallucination_rate.name], ascending=True)
23
  df = df[cols].round(decimals=2)
24
  # filter out if any of the benchmarks have not been produced
25
  df = df[formatting.has_no_nan_values(df, benchmark_cols)]
 
19
  print("all results:",df.to_string())
20
  # exit()
21
  try:
 
22
  df = df[cols].round(decimals=2)
23
  # filter out if any of the benchmarks have not been produced
24
  df = df[formatting.has_no_nan_values(df, benchmark_cols)]