JiaenLiu committed on
Commit
11ff02b
1 Parent(s): dca0a7c

some updates for scores.

Former-commit-id: 5d6b5d325b183e15512755c8f6accc4909b7f8a1

evaluation/scores/LLM_eval.py CHANGED
@@ -13,35 +13,46 @@ from langchain.chat_models import ChatOpenAI
 # Load the evaluator
 
 def init_evaluator(source_lang="en", target_lang="zh", domain="startcraft2", model="gpt-4-0613"):
-    llm = ChatOpenAI(temperature=0, model=model)
 
-    lang_str = f"You are an expert {source_lang} to {target_lang} translator specialized in {domain}."
+    # map the language code to the language name
+    language_map = {
+        "en": "English",
+        "zh": "Chinese",
+    }
+
+    llm = ChatOpenAI(temperature=0, model=model)
 
+    # Completeness is the percentage of the input that is translated
+    # Accuracy is the percentage of the translation that is correct
     fstring = """
-    You are grading the following question:
+    You are grading the translation based on following input:
     {input}
-    Here is the real answer:
+    if the input is "", that means there is no input sentence.
+    you should grade the translation based on the reference translation:
+    Here is the real answer(reference):
     {reference}
-    You are grading the following predicted answer:
+    You are grading the following translation:
     {output}
     based on the following criteria:
     {criteria}
-    Give one grades, accuracy and rate them from a scale of 0 to 100, where 0 is the lowest (very low accuracy) and 100 is the highest (very high accuracy)?
+    Give two grades, accuracy and completeness rate them from a scale of 0 to 100, where 0 is the lowest (very low accuracy/completeness) and 100 is the highest (very high accuracy/completeness)?
     Give explanations for every single one and if the answer if partially correct that is acceptable. However punish the scores for answers that are
     numerically incorrect this also includes values that have the $ in front
     Please give the completeness score first followed by the accuracy score.
-    For example: Accuracy: 40. Explanation here
+    For example:
+    Accuracy: 40. Explanation here
+    Completeness: 80. Explanation here
     Do not differ from the format ever
     """
-    prompt = PromptTemplate.from_template(lang_str+fstring, template_format="f-string")
-
+
+    if source_lang in language_map and target_lang in language_map:
+        lang_str = f"You are an expert {language_map[source_lang]} to {language_map[target_lang]} translator specialized in {domain}."
+        prompt = PromptTemplate.from_template(lang_str+fstring, template_format="f-string")
+
+    else:
+        print("The language code is not supported, please check the language code.")
+        prompt = PromptTemplate.from_template(fstring, template_format="f-string")
 
-    # Give two grades, one for completness and another for accuracy and rate them from a scale of 0 to 100, where 0 is the lowest (very low completeness/accuracy) and 100 is the highest (very high completness/accuracy)?
-    # Do not base the two scores off each other give them the scores independently. Give explanations for every single one and if the answer if partially correct that is acceptable. However punish the scores for answers that are
-    # numerically incorrect this also includes values that have the $ in front
-    # Please give the completeness score first followed by the accuracy score.
-    # For example: Completeness: 70. Accuracy: 40. Explanation here
-    # Do not differ from the format ever
     return load_evaluator("labeled_criteria", llm=llm, prompt=prompt, criteria="correctness")
 
 # prase the output of the evaluation
@@ -55,18 +66,41 @@ def init_evaluator(source_lang="en", target_lang="zh", domain="startcraft2", mod
 #     explanation = ".".join(value[1:])
 #     return int(value[0]), explanation
 
-def parse_eval_result(eval_result):
-    # Extract the 'Accuracy' score using a regular expression from the 'reasoning' key
-    accuracy_match = re.search(r'Accuracy: (\d+)', eval_result['reasoning'])
-    if accuracy_match:
-        accuracy = int(accuracy_match.group(1))
-    else:
-        accuracy = 0
+# def parse_eval_result(eval_result):
+#     # Extract the 'Accuracy' score using a regular expression from the 'reasoning' key
+#     accuracy_match = re.search(r'Accuracy: (\d+)', eval_result['value'])
+#     print(accuracy_match)
+#     if accuracy_match:
+#         accuracy = int(accuracy_match.group(1))
+#     else:
+#         # try to get the accuracy from the 'value' key
+#         accuracy = 0
+
+#     # Directly get the 'Explanation' value from the 'value' key
+#     explanation = eval_result['value']
+
+#     return accuracy, explanation
+
+def parse_eval_result(data):
+    # Extract the value string
+    value_str = data.get('value', '')
+    reasoning_str = data.get('reasoning', '')
+
+    # Use regex to extract accuracy value and explanation
+    accuracy_match = re.search(r'Accuracy: (\d+)', value_str)
+    acc_explanation_match = re.search(r'Accuracy: \d+\. (.+)', value_str)
+
+    # Use regex to extract completeness value and explanation
+    completeness_match = re.search(r'Completeness: (\d+)', reasoning_str)
+    completeness_explanation_match = re.search(r'Completeness: \d+\. (.+)', reasoning_str)
 
-    # Directly get the 'Explanation' value from the 'value' key
-    explanation = eval_result['value']
+    # Extract the matched groups
+    completeness = int(completeness_match.group(1)) if completeness_match else None
+    completeness_explanation = completeness_explanation_match.group(1) if completeness_explanation_match else None
+    accuracy = int(accuracy_match.group(1)) if accuracy_match else None
+    acc_explanation = acc_explanation_match.group(1) if acc_explanation_match else None
 
-    return accuracy, explanation
+    return (accuracy, acc_explanation), (completeness, completeness_explanation)
 
 def evaluate_prediction(input, reference, prediction, evaluator):
     eval_result = evaluator.evaluate_strings(
@@ -74,11 +108,14 @@ def evaluate_prediction(input, reference, prediction, evaluator):
         input=input,
         reference=reference,
     )
+    # print(eval_result)
     return parse_eval_result(eval_result)
 
 if __name__ == "__main__":
     evaluator = init_evaluator()
     # For no input english sentence, just put "" in the input
-    accuracy, explanation = evaluate_prediction("this is an test sentences", "这不是一个测试语句。", "这是一个测试句子。", evaluator)
-    print("Accuracy:", accuracy)
-    print("Explanation:", explanation)
+    accuracy, completeness = evaluate_prediction("this is an test sentences", "这不是一个测试语句。", "这是一个测试句子。", evaluator)
+    print("Accuracy:", accuracy[0])
+    print("Acc_Explanation:", accuracy[1])
+    print("Completeness:", completeness[0])
+    print("Comp_Explanation:", completeness[1])
evaluation/scores/multi_scores.py CHANGED
@@ -1,26 +1,46 @@
 from comet import download_model, load_from_checkpoint
 from sacrebleu.metrics import BLEU, CHRF, TER
-from scores import LLM_eval
+# from scores import LLM_eval
+import LLM_eval
 
 class multi_scores:
-    def __init__(self, source_lang="English", target_lang="Chinese", domain="starcraft 2") -> None:
+    def __init__(self, source_lang="en", target_lang="zh", domain="starcraft 2") -> None:
         self.comet_model = load_from_checkpoint(download_model("Unbabel/wmt22-comet-da"))
-        self.bleu_model = BLEU(tokenize="zh")
+        self.bleu_model = BLEU(tokenize=target_lang)
         self.LLM_model = LLM_eval.init_evaluator(source_lang=source_lang, target_lang=target_lang, domain=domain)
+        # self.score = {}
 
     # The function to get the scores
     # src: orginal sentence
     # mt: machine translation
     # ref: reference translation
+    def calculate_comet_llm(self, src:str, mt:str, ref:str) -> dict:
+        comet_score = self.comet_model.predict([{"src":src, "mt":mt, "ref":ref}], batch_size=8, gpus=0).scores[0]
+        # bleu_score = self.bleu_model.corpus_score([mt], [ref]).score
+        llm_acc, llm_completeness = LLM_eval.evaluate_prediction(src, ref, mt, self.LLM_model)
+        return {'comet_score':comet_score, 'llm_score':llm_acc[0], 'llm_explanation': llm_acc[1]}
+        # self.score['bleu_score'] = bleu_score
+        # self.score['comet_score'] = comet_score
+        # self.score['llm_score'] = llm_score
+        # self.score['llm_explanation'] = llm_explanation
+
+    def calculate_bleu(self, mts:list, refs:list) -> dict:
+        # mt and ref are list of sentences
+        bleu_score = self.bleu_model.corpus_score(mts, refs).score
+        return {'bleu_score':bleu_score}
+
     def get_scores(self, src:str, mt:str, ref:str) -> dict:
         comet_score = self.comet_model.predict([{"src":src, "mt":mt, "ref":ref}], batch_size=8, gpus=0).scores[0]
         bleu_score = self.bleu_model.corpus_score([mt], [ref]).score
-        llm_score, llm_explanation = LLM_eval.evaluate_prediction(src, ref, mt, self.LLM_model)
-        return {'bleu_score':bleu_score, 'comet_score':comet_score, 'llm_score':llm_score, 'llm_explanation': llm_explanation}
+        llm_acc, llm_completeness = LLM_eval.evaluate_prediction(src, ref, mt, self.LLM_model)
+        return {'bleu_score':bleu_score ,'comet_score':comet_score, 'llm_score':llm_acc[0], 'llm_explanation': llm_acc[1]}
+
 
 if __name__ == "__main__":
     src = "this is an test sentences"
     mt = "这是一个测试句子。"
     ref = "这不是一个测试语句。"
     print(multi_scores().get_scores(src, mt, ref))
+    # print(multi_scores().calculate_comet_llm(src, mt, ref))
+    # print(multi_scores().calculate_bleu([mt], [ref]))
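
As background for the new calculate_bleu method: sacrebleu's BLEU.corpus_score expects a list of hypothesis sentences plus a list of reference streams, where each stream is a list aligned with the hypotheses. A minimal standalone sketch reusing the sentences from the __main__ block, assuming that calling convention:

from sacrebleu.metrics import BLEU

mts = ["这是一个测试句子。"]       # machine translations (hypotheses)
refs = [["这不是一个测试语句。"]]  # one reference stream, aligned with mts

bleu = BLEU(tokenize="zh")
print(bleu.corpus_score(mts, refs).score)
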