JiaenLiu committed on
Commit
b0198e8
1 Parent(s): 61ca873

Evaluation structure


Former-commit-id: 956e1085199af0c47b3fc01f395f1c3be195ece4

evaluation/alignment.py ADDED
File without changes
evaluation/evaluation.py ADDED
File without changes
evaluation/readme.md ADDED
@@ -0,0 +1,24 @@
+ Evaluation:
+ BLEU (https://github.com/mjpost/sacrebleu)
+ COMET (https://github.com/Unbabel/COMET)
+ LLM eval
+ Eval time stamp
+
+ Sep 18 - Sep 25
+ Proj-t
+   src
+   evaluation
+     - scores
+       - LLM_eval.py (jiaen)
+       - scores.py (wizard)
+       - comet
+       - sacrebleu
+     - alignment.py (david)
+     - evaluation.py (not assigned)
+     - results
+       - mmddyy-HMS-results.csv
+     - logs
+
+ entry:
+ python3 evaluation/evaluation.py --pred path/to/pred --gt path/to/gt
+
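Both evaluation.py and scores.py are empty placeholders in this commit, so the entry command above is only declared. A minimal sketch of what the --pred/--gt entry point could look like, assuming plain-text inputs with one sentence per line and corpus-level BLEU via sacrebleu (the helper names and file layout are assumptions, not part of this commit):

```python
# Hypothetical sketch of evaluation/evaluation.py; the file is empty in this commit.
# Assumes --pred and --gt point to plain-text files with one sentence per line.
import argparse

import sacrebleu  # https://github.com/mjpost/sacrebleu


def read_lines(path):
    """Read non-empty, stripped lines from a UTF-8 text file."""
    with open(path, encoding="utf-8") as f:
        return [line.strip() for line in f if line.strip()]


def main():
    parser = argparse.ArgumentParser(description="Score predictions against ground truth.")
    parser.add_argument("--pred", required=True, help="Path to predicted sentences")
    parser.add_argument("--gt", required=True, help="Path to ground-truth sentences")
    args = parser.parse_args()

    preds = read_lines(args.pred)
    refs = read_lines(args.gt)

    # Corpus-level BLEU; sacrebleu expects a list of reference streams.
    bleu = sacrebleu.corpus_bleu(preds, [refs])
    print(f"BLEU: {bleu.score:.2f}")


if __name__ == "__main__":
    main()
```

COMET and LLM-based scores would presumably be wired in the same way once the modules under evaluation/scores/ are filled in.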
evaluation/scores/LLM_eval.py ADDED
@@ -0,0 +1,76 @@
+ # This script evaluates the performance of the Pigeon AI Video Translation system using a Large Language Model.
+
+ # Written by Jiaen LIU, 2023/09/18
+
+ # Import the necessary packages
+ from langchain.evaluation import load_evaluator, EvaluatorType
+ from langchain.prompts import PromptTemplate
+ from langchain.chat_models import ChatAnthropic
+
+ import re
+ from tqdm import tqdm
+ import pandas as pd
+ import numpy as np
+ import time
+ import os
+ import argparse
+ from pathlib import Path
+ from src.srt_util.srt import SrtScript
+
+ class PiegonLLMEvaluator():
+     """
+     input :
+         - predicted sentences
+         - ground truth sentences
+     output :
+         - scores
+         - explanations
+     """
+
+     def __init__(self, output_dir: str, data_dir: str) -> None:
+         # self.__eval_chain = self.__initialize_QAEvalChain()
+         self.__data_dir = data_dir
+         self.__result_dir = output_dir
+         # self.__chatbot = evaluater
+         self.__result_df = pd.DataFrame()
+         self.__initialize_df()
+         self.running_cost = 0
+         self.last_cost = 0
+         pass
+
+     def __init_llm_evaluater(self):
+         # llm = ChatOpenAI(temperature=0, model="gpt-4-0613")
+         # search = SerpAPIWrapper()
+         # tools = [
+         #     Tool(
+         #         name="Search",
+         #         func=search.run,
+         #         coroutine=search.arun,
+         #         description="Useful when you need to answer questions about current events. You should ask targeted questions.",
+         #     ),
+         # ]
+         # agents = [
+         #     initialize_agent(tools, llm, agent=AgentType.OPENAI_MULTI_FUNCTIONS, verbose=False),
+         #     initialize_agent(tools, llm, agent=AgentType.CHAT_ZERO_SHOT_REACT_DESCRIPTION, verbose=False)
+         # ]
+
+         llm = ChatAnthropic(temperature=0)
+
+         fstring = """You are an expert English to Chinese translator specialized in StarCraft 2.
+ You are grading the following question:
+ {query}
+ Here is the real answer:
+ {answer}
+ You are grading the following predicted answer:
+ {result}
+ Give two grades, one for completeness and one for accuracy, each on a scale of 0 to 100, where 0 is the lowest (very low completeness/accuracy) and 100 is the highest (very high completeness/accuracy).
+ Do not base the two scores on each other; assign them independently. Give an explanation for each score; a partially correct answer is acceptable. However, penalize answers that are
+ numerically incorrect, including values that have a $ in front.
+ Please give the completeness score first, followed by the accuracy score.
+ For example: Completeness: 70. Accuracy: 40. Explanation here
+ Do not deviate from this format.
+ """
+         prompt = PromptTemplate.from_template(fstring)
+
+         self.__llm_evaluator = load_evaluator("criteria", llm=llm, criteria="conciseness", prompt=prompt)
+
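The grading prompt above pins the model to a fixed "Completeness: X. Accuracy: Y. Explanation" reply format, and the script imports re without using it yet. A small, hypothetical sketch (not part of this commit; the helper name and regex are assumptions) of how such a reply could be parsed into numeric scores:

```python
import re
from typing import Optional, Tuple


# Hypothetical helper: parse the fixed-format reply requested by the prompt above,
# e.g. "Completeness: 70. Accuracy: 40. Explanation here".
def parse_llm_scores(text: str) -> Tuple[Optional[int], Optional[int], str]:
    match = re.search(r"Completeness:\s*(\d+)\D+Accuracy:\s*(\d+)\.?\s*(.*)", text, re.DOTALL)
    if not match:
        # The model deviated from the format; return the raw text as the explanation.
        return None, None, text
    completeness = int(match.group(1))
    accuracy = int(match.group(2))
    explanation = match.group(3).strip()
    return completeness, accuracy, explanation


# Example:
# parse_llm_scores("Completeness: 70. Accuracy: 40. The translation drops one clause.")
# -> (70, 40, "The translation drops one clause.")
```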
evaluation/scores/scores.py ADDED
File without changes
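scores.py is likewise an empty placeholder here; per the readme it is meant to hold the comet and sacrebleu scoring. A hypothetical sketch of a corpus-level COMET score using the unbabel-comet package's download_model/load_from_checkpoint/predict interface (the function name, model choice, and data layout are illustrative assumptions, not part of this commit):

```python
# Hypothetical sketch for evaluation/scores/scores.py, which is empty in this commit.
# Assumes the unbabel-comet package (https://github.com/Unbabel/COMET) is installed.
from comet import download_model, load_from_checkpoint


def comet_system_score(sources, predictions, references, model_name="Unbabel/wmt22-comet-da"):
    """Return a corpus-level COMET score for parallel lists of sentences."""
    model_path = download_model(model_name)      # downloads and caches the checkpoint
    model = load_from_checkpoint(model_path)
    data = [
        {"src": src, "mt": mt, "ref": ref}
        for src, mt, ref in zip(sources, predictions, references)
    ]
    output = model.predict(data, batch_size=8, gpus=0)  # gpus=0 runs on CPU
    return output.system_score
```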