dh-mc committed
Commit 01f4bd7
1 Parent(s): 32a6937

supported flag APPLY_CHAT_TEMPLATE_FOR_RAG

.env.example CHANGED
@@ -25,7 +25,11 @@ OPENAI_MODEL_NAME=
  OLLAMA_MODEL_NAME=llama3:8b

  OLLAMA_RP=1.15
+ HF_RP=1.15

+ LANGCHAIN_DEBUG=false
+ BATCH_SIZE=1
+ APPLY_CHAT_TEMPLATE_FOR_RAG=true

  # cpu, mps or cuda:0 - if unset, use whatever detected
  HF_EMBEDDINGS_DEVICE_TYPE=
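The new keys are read back with plain os.getenv calls in the app modules (APPLY_CHAT_TEMPLATE_FOR_RAG is compared against the literal string "true", as shown in llm_qa_chain.py below). A minimal sketch of that parsing; the defaults and comments here are assumptions for illustration, not values taken from the repo:

import os

from dotenv import load_dotenv  # python-dotenv, the same package that provides find_dotenv in init.py

load_dotenv()

hf_rp = float(os.getenv("HF_RP", "1.15"))  # presumably the repetition penalty for HuggingFace models
langchain_debug = os.getenv("LANGCHAIN_DEBUG", "false").lower() == "true"
batch_size = int(os.getenv("BATCH_SIZE", "1"))
apply_chat_template_for_rag = os.getenv("APPLY_CHAT_TEMPLATE_FOR_RAG") == "true"

print(hf_rp, langchain_debug, batch_size, apply_chat_template_for_rag)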
app_modules/init.py CHANGED
@@ -10,7 +10,7 @@ from langchain.vectorstores.chroma import Chroma
  from langchain.vectorstores.faiss import FAISS

  from app_modules.llm_loader import LLMLoader
- from app_modules.utils import get_device_types, init_settings, load_spacy_model
+ from app_modules.utils import get_device_types, init_settings

  found_dotenv = find_dotenv(".env")

@@ -53,21 +53,13 @@ def app_init():
      using_faiss = os.environ.get("FAISS_INDEX_PATH") is not None
      llm_model_type = os.environ.get("LLM_MODEL_TYPE")

-     debug_metrics = os.getenv("DEBUG_METRICS", "false").lower() == "true"
-
-     if debug_metrics:
-         start = timer()
-         load_spacy_model()
-         end = timer()
-         print(f"Completed in {end - start:.3f}s")
-
      qa_with_rag = os.getenv("QA_WITH_RAG", "true").lower() == "true"
      print(f"qa_with_rag: {qa_with_rag}")

      retrieve_from_questions_file = os.getenv("RETRIEVER_TYPE") == "questions_file"
      print(f"retrieve_from_questions_file: {retrieve_from_questions_file}", flush=True)

-     if qa_with_rag and not retrieve_from_questions_file or debug_metrics:
+     if qa_with_rag and not retrieve_from_questions_file:
          print(f"hf_embeddings_model_name: {hf_embeddings_model_name}")
          start = timer()
          embeddings = HuggingFaceInstructEmbeddings(
app_modules/llm_chat_chain.py CHANGED
@@ -6,7 +6,7 @@ from langchain.chains import ConversationChain, LLMChain
  from langchain.prompts import PromptTemplate
  from langchain.chains.base import Chain

- from app_modules.llm_inference import LLMInference
+ from app_modules.llm_inference import LLMInference, get_system_prompt_and_user_message
  from app_modules.utils import CustomizedConversationSummaryBufferMemory
  from langchain.chains import LLMChain
  from langchain.globals import get_debug
@@ -15,23 +15,6 @@ chat_history_enabled = os.getenv("CHAT_HISTORY_ENABLED", "false").lower() == "tr
  B_INST, E_INST = "[INST]", "[/INST]"


- def get_system_prompt_and_user_message(orca=False):
-     # system_prompt = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."
-     system_prompt = (
-         "You are Orca, an AI language model created by Microsoft. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior."
-         if orca
-         else "You are a chatbot having a conversation with a human."
-     )
-
-     user_message = "{input}"
-
-     if chat_history_enabled:
-         user_message = "Chat History:\n\n{history} \n\n" + user_message
-         system_prompt += " Read the chat history to get context."
-
-     return system_prompt, user_message
-
-
  def create_llama_2_prompt_template():
      B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

@@ -141,19 +124,7 @@ class ChatChain(LLMInference):
          if not isinstance(inputs, list):
              inputs = {"input": inputs["question"]}
          elif self.llm_loader.llm_model_type == "huggingface":
-             inputs = [
-                 [
-                     {
-                         "role": "system",
-                         "content": self.get_system_message(i),
-                     },
-                     {
-                         "role": "user",
-                         "content": self.get_user_message(i),
-                     },
-                 ]
-                 for i in inputs
-             ]
+             inputs = [self.apply_chat_template(input["question"]) for input in inputs]
          else:
              inputs = [{"input": i["question"]} for i in inputs]

@@ -161,9 +132,3 @@
              print("_process_inputs:", json.dumps(inputs, indent=4))

          return inputs
-
-     def get_system_message(self, input) -> Chain:
-         return get_system_prompt_and_user_message()[0]
-
-     def get_user_message(self, input) -> Chain:
-         return input["question"]
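With this change the HuggingFace code path in _process_inputs delegates prompt assembly to the shared apply_chat_template helper on LLMInference, so each batch item becomes a list of role/content messages instead of being built inline. A rough standalone sketch of the resulting structure; the model name is illustrative and the real helper lives in llm_inference.py below:

# Standalone approximation of the new code path, not the repo's actual classes.
def apply_chat_template(user_message, model_name="some-chat-model"):
    messages = []
    if not model_name.lower().startswith("gemma"):
        # assumption: the system message is skipped for Gemma-style chat templates
        messages.append(
            {"role": "system", "content": "You are a chatbot having a conversation with a human."}
        )
    messages.append({"role": "user", "content": user_message})
    return messages


inputs = [{"question": "What is RAG?"}, {"question": "What does BLEU-1 measure?"}]
inputs = [apply_chat_template(item["question"]) for item in inputs]
# each element is now [{"role": "system", ...}, {"role": "user", ...}]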
app_modules/llm_inference.py CHANGED
@@ -14,6 +14,25 @@ from langchain.chains.base import Chain
  from app_modules.llm_loader import LLMLoader, TextIteratorStreamer
  from app_modules.utils import remove_extra_spaces

+ chat_history_enabled = os.getenv("CHAT_HISTORY_ENABLED", "false").lower() == "true"
+
+
+ def get_system_prompt_and_user_message(orca=False):
+     # system_prompt = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."
+     system_prompt = (
+         "You are Orca, an AI language model created by Microsoft. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior."
+         if orca
+         else "You are a chatbot having a conversation with a human."
+     )
+
+     user_message = "{input}"
+
+     if chat_history_enabled:
+         user_message = "Chat History:\n\n{history} \n\n" + user_message
+         system_prompt += " Read the chat history to get context."
+
+     return system_prompt, user_message
+

  class LLMInference(metaclass=abc.ABCMeta):
      def __init__(self, llm_loader):
@@ -143,3 +162,22 @@ class LLMInference(metaclass=abc.ABCMeta):

          t.join()
          return que.get()
+
+     def apply_chat_template(self, user_message):
+         result = (
+             []
+             if self.llm_loader.model_name.lower().startswith("gemma")
+             else [
+                 {
+                     "role": "system",
+                     "content": get_system_prompt_and_user_message()[0],
+                 }
+             ]
+         )
+         result.append(
+             {
+                 "role": "user",
+                 "content": user_message,
+             }
+         )
+         return result
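The message lists built by apply_chat_template follow the standard transformers chat format (the Gemma special case presumably exists because Gemma chat templates do not accept a separate system role). Downstream they can be rendered into a prompt string with the tokenizer's own chat template; a minimal sketch, with the model name chosen purely for illustration since the actual wiring through LLMLoader is not part of this diff:

from transformers import AutoTokenizer

# Illustrative model; any model with a chat template that accepts a system role works here.
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")

messages = [
    {"role": "system", "content": "You are a chatbot having a conversation with a human."},
    {"role": "user", "content": "What is Retrieval-Augmented Generation?"},
]

prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(prompt)  # the chat-formatted string the pipeline would then complete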
app_modules/llm_qa_chain.py CHANGED
@@ -6,12 +6,17 @@ from langchain.chains import ConversationalRetrievalChain
  from langchain.chains.base import Chain
  from app_modules.llm_inference import LLMInference
  from app_modules.utils import CustomizedConversationSummaryBufferMemory
+
  from langchain_core.retrievers import BaseRetriever
  from langchain_core.documents import Document
  from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
  from langchain.globals import get_debug

  retrieve_from_questions_file = os.getenv("RETRIEVER_TYPE") == "questions_file"
+ apply_chat_template_for_rag = os.getenv("APPLY_CHAT_TEMPLATE_FOR_RAG") == "true"
+
+ print(f"retrieve_from_questions_file: {retrieve_from_questions_file}", flush=True)
+ print(f"apply_chat_template_for_rag: {apply_chat_template_for_rag}", flush=True)

  if retrieve_from_questions_file:
      questions_file_path = os.getenv("QUESTIONS_FILE_PATH")
@@ -108,8 +113,11 @@ class QAChain(LLMInference):
          # find the query in the df
          filtered = df[df["question"].str.lower() == query.lower()]

-         context = filtered.iloc[0]["context"]
+         context = filtered.iloc[0]["context"] if len(filtered) > 0 else ""

-         return (
-             f"{qa_system_prompt}\n\n{context}\n\nQuestion: {query}\n\nHelpful Answer:"
-         )
+         if apply_chat_template_for_rag:
+             return self.apply_chat_template(
+                 f"{qa_system_prompt}\n\n{context}\n\nQuestion: {query}"
+             )
+         else:
+             return f"{qa_system_prompt}\n\n{context}\n\nQuestion: {query}\n\nHelpful Answer:"
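So the RAG prompt now takes one of two shapes depending on APPLY_CHAT_TEMPLATE_FOR_RAG: a chat-message list to be rendered by the model's chat template, or the original flat string ending in "Helpful Answer:". A small illustration with made-up system prompt, context, and query values; for non-Gemma models the message list would additionally carry a system message, per apply_chat_template above:

# Illustration only; qa_system_prompt, context and query are placeholder strings.
qa_system_prompt = "Use the following pieces of context to answer the question at the end."
context = "Jamaica's most widely spoken languages are Jamaican English and Jamaican Patois."
query = "what does jamaican people speak"

for apply_chat_template_for_rag in (True, False):
    if apply_chat_template_for_rag:
        # message-list form, later rendered by the tokenizer's chat template
        prompt = [{"role": "user", "content": f"{qa_system_prompt}\n\n{context}\n\nQuestion: {query}"}]
    else:
        # legacy flat-string form
        prompt = f"{qa_system_prompt}\n\n{context}\n\nQuestion: {query}\n\nHelpful Answer:"
    print(type(prompt).__name__, prompt)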
app_modules/utils.py CHANGED
@@ -7,7 +7,8 @@ import os
  import platform
  import re
  from pathlib import Path
-
+ import evaluate
+ import pandas as pd
  import requests
  import torch
  from tqdm import tqdm
@@ -186,234 +187,79 @@ class CustomizedConversationSummaryBufferMemory(ConversationSummaryBufferMemory)
      )


- def CalculateDistance(entry1, entry2, distance_calculator):
-     if entry1 == entry2:
-         return 0
-     distance = distance_calculator.evaluate_string_pairs(
-         prediction=entry1, prediction_b=entry2
-     )
-     # print(f"entry1: {entry1}, entry2: {entry2}, distance: {distance['score']}")
-     return distance["score"]
-
-
- def FindInList(entry, elist, distance_calculator=None, debug=False):
-     for item in elist:
-         if distance_calculator is not None:
-             distance = CalculateDistance(entry, item, distance_calculator)
-             if distance < distance_threshold:
-                 if debug:
-                     print(
-                         f"FindInList - matched by distance {distance:.3f}: {entry} - {item}"
-                     )
-                 return True
-         if entry == item:
-             return True
-     return False
-
-
- def CalculatePRF1F2(
-     goldAnswerList, predAnswerList, distance_calculator=None, debug=False
- ):
-     if len(goldAnswerList) == 0:
-         if len(predAnswerList) == 0:
-             return [
-                 1.0,
-                 1.0,
-                 1.0,
-                 1.0,
-             ]  # consider it 'correct' when there is no labeled answer, and also no predicted answer
-         else:
-             return [
-                 0.0,
-                 1.0,
-                 0.0,
-                 0.0,
-             ]  # precision=0 and recall=1 when there is no labeled answer, but has some predicted answer(s)
-     elif len(predAnswerList) == 0:
-         return [
-             1.0,
-             0.0,
-             0.0,
-             0.0,
-         ]  # precision=1 and recall=0 when there is labeled answer(s), but no predicted answer
-     else:
-         glist = goldAnswerList
-         plist = predAnswerList
-
-         tp = 1e-40  # numerical trick
-         fp = 0.0
-         fn = 0.0
-
-         for gentry in glist:
-             if FindInList(
-                 gentry, plist, distance_calculator=distance_calculator, debug=True
-             ):
-                 tp += 1
-             else:
-                 fn += 1
-         for pentry in plist:
-             if not FindInList(pentry, glist, distance_calculator=distance_calculator):
-                 fp += 1
-
-         precision = tp / (tp + fp)
-         recall = tp / (tp + fn)
-
-         f1 = (2 * precision * recall) / (precision + recall)
-         f2 = (5 * precision * recall) / (4 * precision + recall)
-         return [precision, recall, f1, f2]
-
-
- nlp = None
- distance_threshold = 0.05
-
-
- def load_spacy_model():
-     import spacy
-
-     global nlp
-     if nlp is not None:
-         return nlp
-
-     global distance_threshold
-     distance_threshold = float(os.getenv("DISTANCE_THRESHOLD", "0.05"))
-
-     spacy_model_name = os.getenv("SPACY_MODEL_NAME", "en_core_web_trf")
-
-     while True:
-         try:
-             print(f"loading spacy model from {spacy_model_name}")
-             nlp = spacy.load(spacy_model_name)
-             print(f"loaded spacy model from {spacy_model_name}")
-             return nlp
-         except OSError:
-             print(f"downloading spacy model {spacy_model_name}")
-             spacy.cli.download(spacy_model_name)
-             print(f"downloaded spacy model {spacy_model_name}")
-
-
- def clean_text(text):
-     text = text.lower()
-     text = text.replace('"', "")
-     text = text.replace(".", "")
-     # text = text.replace("ō", "o")
-     return text
-
-
- def get_entities_in_text(text, debug=False):
-     nlp = load_spacy_model()
-     doc = nlp(text)
-     entities_in_text = []
-     for word in doc.ents:
-         if debug:
-             print(word.text, word.label_)
-         entity = clean_text(word.text)
-         if entity not in entities_in_text:
-             entities_in_text.append(entity)
-
-     entities_in_text.sort()
-     return entities_in_text
-
-
- def calculate_metrics(question, answer, distance_calculator=None, debug=False):
-     ground_truth = question["answers"]
-     ground_truth.sort()
-
-     if debug:
-         print(f"question: {question}")
-         print(f"answer: {answer}")
-
-         print("entities_in_question ---------------")
-         entities_in_question = get_entities_in_text(question["question"], debug)
-
-         print("entities_in_answer -----------------")
-         entities_in_answer = get_entities_in_text(answer, debug)
-
-         print("done with NER with spaCy -----------")
-
-         entities_in_answer.sort()
-
-         predAnswerList = [
-             pentry
-             for pentry in entities_in_answer
-             if not FindInList(pentry, entities_in_question)
-         ]
-
-         print(f"entities_in_question: {entities_in_question}")
-         print(f"entities_in_answer: {entities_in_answer}")
-         print(f"ground_truth: {ground_truth}")
-         print(f"pred_answers: {predAnswerList}")
-
-         precision, recall, f1, f2 = CalculatePRF1F2(
-             ground_truth,
-             predAnswerList,
-             debug=debug,
-             distance_calculator=distance_calculator,
-         )
-         print(f"precision: {precision}, recall: {recall}, f1: {f1}, f2: {f2}")
-     else:
-         precision = 0.0
-         recall = 0.0
-         f1 = 0.0
-         f2 = 0.0
-         entities_in_answer = []
-         entities_in_question = []
-
-     return (
-         precision,
-         recall,
-         f1,
-         f2,
-         entities_in_answer,
-         ground_truth,
-         entities_in_question,
-     )
-
-
- def calculate_metrics_gemini(question, answer, debug=False):
-     precision = 0.0
-     recall = 0.0
-     f1 = 0.0
-
-     return (precision, recall, f1)
-
-
- if __name__ == "__main__":
-     from langchain_community.embeddings import HuggingFaceInstructEmbeddings
-     from langchain.evaluation import load_evaluator
-
-     hf_embeddings_device_type, hf_pipeline_device_type = get_device_types()
-     print(f"hf_embeddings_device_type: {hf_embeddings_device_type}")
-     print(f"hf_pipeline_device_type: {hf_pipeline_device_type}")
-
-     hf_embeddings_model_name = "hkunlp/instructor-large"
-     print(f"hf_embeddings_model_name: {hf_embeddings_model_name}")
-     embeddings = HuggingFaceInstructEmbeddings(
-         model_name=hf_embeddings_model_name,
-         model_kwargs={"device": hf_embeddings_device_type},
-     )
-
-     hf_evaluator = load_evaluator("pairwise_embedding_distance", embeddings=embeddings)
-
-     question = {
-         "question": "what does jamaican people speak",
-         "entities_in_question": ["jamaican"],
-         "answers": ["jamaican english", "jamaican creole english language"],
-     }
-     answer = "Jamaican people primarily speak Jamaican Patois, which is an English-based creole language with significant West African influences. It is spoken as a native language by the majority of Jamaicans and also exists in various forms among Jamaican expatriates and non-Jamaicans in different parts of the world. The phonology of Jamaican Patois includes around 21 consonants (with some dialectal variation regarding the status of /h/ as a phoneme) and between nine and sixteen vowels, some of which are capable of nasalization or lengthening. There are also instances of palatalization in Jamaican Patois, where certain consonants appear to be phonemic in some dialects but may be considered phonetic in others. For example, the palatal stops [c], [ɟ], and [ɲ] may be analyzed as phonemes or as instances of phonetic palatalization depending on the account."
-     calculate_metrics(question, answer, distance_calculator=hf_evaluator, debug=True)
-
-     question = {
-         "question": "who is governor of ohio 2011",
-         "entities_in_question": ["2011"],
-         "answers": ["john kasich", "return j. meigs, jr.", "ted strickland"],
-     }
-     answer = "The lieutenant governor of Ohio in 2011 was Mary Taylor, who served alongside Governor John Kasich. She assumed office on January 10, 2011, after being elected as the lieutenant governor in the 2010 election. During her tenure, she faced criticism for using the state airplane for personal errands and reportedly had high turnover among her staff."
-     calculate_metrics(question, answer, distance_calculator=hf_evaluator, debug=True)
-
-     question = {
-         "question": "where is the fukushima daiichi nuclear power station",
-         "entities_in_question": ["the fukushima daiichi nuclear power station"],
-         "answers": ["japan", "okuma"],
-     }
-     answer = "The Fukushima Daiichi Nuclear Power Station is located in the towns of Ōkuma and Futaba in Fukushima Prefecture, Japan."
-     calculate_metrics(question, answer, distance_calculator=hf_evaluator, debug=True)
+ bleu = evaluate.load("bleu")
+ rouge = evaluate.load("rouge")
+
+
+ def calc_metrics(df):
+     predictions = [df["answer"][i] for i in range(len(df))]
+     references = [df["ground_truth"][i] for i in range(len(df))]
+
+     bleu_scores = bleu.compute(
+         predictions=predictions, references=references, max_order=1
+     )
+     rouge_scores = rouge.compute(predictions=predictions, references=references)
+     return {"bleu_scores": bleu_scores, "rouge_scores": rouge_scores}
+
+
+ pattern_abnormal_newlines = re.compile(r"\n{5,}")
+ pattern_text_repetitions = re.compile(r"\b(\w.+?)\b(\1+)", re.M | re.DOTALL)
+ exception_pattern = re.compile(r"(\w+\.)\1")
+
+
+ # final version for repetition detection
+ def detect_repetitions(
+     text, debug=False, pattern_text_repetitions=pattern_text_repetitions
+ ):
+     subtotals = [0, 0]
+
+     if isinstance(text, str):
+         patterns = [pattern_abnormal_newlines, pattern_text_repetitions]
+         for i, pattern in enumerate(patterns):
+             if debug:
+                 print(
+                     f"----detect {'abnormal newlines' if i == 0 else 'text repetitions'}----"
+                 )
+             matches = pattern.finditer(text)
+             for match in matches:
+                 if debug:
+                     print(match)
+                     for groupNum in range(0, len(match.groups())):
+                         groupNum = groupNum + 1
+                         print(
+                             "Group {groupNum} found at {start}-{end}: `{group}`".format(
+                                 groupNum=groupNum,
+                                 start=match.start(groupNum),
+                                 end=match.end(groupNum),
+                                 group=match.group(groupNum),
+                             )
+                         )
+
+                 if exception_pattern.match(match[0]):
+                     if debug:
+                         print("ignored: ", match[0])
+                     continue
+
+                 start, end = match.span()
+                 subtotals[i] += end - start
+
+     result = (subtotals[0], subtotals[1], subtotals[0] + subtotals[1])
+
+     if debug:
+         print(result)
+     return result
+
+
+ def detect_abnormal_newlines(text, debug=False):
+     return detect_repetitions(text, debug=debug)[0]
+
+
+ def detect_text_repetitions(text, debug=False):
+     return detect_repetitions(text, debug=debug)[1]
+
+
+ def detect_repetition_scores(text, debug=False):
+     newline_score, repetition_score, total_repetitions = detect_repetitions(
+         text, debug=debug
+     )
+     return pd.Series([newline_score, repetition_score, total_repetitions])
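In short, the spaCy/NER-based precision-recall metrics move out of utils.py and are replaced by corpus BLEU-1 and ROUGE scores from the evaluate library plus a regex-based repetition detector. A small usage sketch, assuming the new dependencies (evaluate, rouge_score, pandas) are installed; the sample answer and references below are made up:

import evaluate
import pandas as pd

bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")

# Toy data in the same shape qa_chain_test.py builds: one answer plus a list of references per row.
df = pd.DataFrame(
    {
        "answer": ["Jamaicans speak English and Jamaican Patois."],
        "ground_truth": [["jamaican english", "jamaican creole english language"]],
    }
)

bleu_scores = bleu.compute(
    predictions=list(df["answer"]), references=list(df["ground_truth"]), max_order=1
)
rouge_scores = rouge.compute(
    predictions=list(df["answer"]), references=list(df["ground_truth"])
)
print(bleu_scores["bleu"], rouge_scores["rougeL"])  # the two numbers the test harness records

# Inside the repo the helpers are imported directly, e.g.:
#   from app_modules.utils import calc_metrics, detect_repetition_scores
#   df[["newline_score", "repetition_score", "total_repetitions"]] = df["answer"].apply(detect_repetition_scores)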
qa_chain_test.py CHANGED
@@ -12,7 +12,7 @@ if chatting:

  from app_modules.init import app_init
  from app_modules.llm_qa_chain import QAChain
- from app_modules.utils import print_llm_response
+ from app_modules.utils import print_llm_response, calc_metrics, detect_repetition_scores

  llm_loader, qa_chain = app_init()

@@ -116,7 +116,9 @@ if __name__ == "__main__":
          query = df["question"][i]
          id = df["id"][i]

-         ground_truth = question["answers"]
+         ground_truth = question[
+             "wellFormedAnswers" if "wellFormedAnswers" in question else "answers"
+         ]

          word_count = len(nltk.word_tokenize(answer))

@@ -128,6 +130,10 @@
              "ground_truth": ground_truth,
          }

+     df2[["newline_score", "repetition_score", "total_repetitions"]] = df2[
+         "answer"
+     ].apply(detect_repetition_scores)
+
      pd.options.display.float_format = "{:.3f}".format
      print(df2.describe())

@@ -147,6 +153,8 @@
      df2.to_csv(csv_file, mode="a", index=False, header=True)
      print(f"test results saved to file: {csv_file}")

+     scores = calc_metrics(df2)
+
      df = pd.DataFrame(
          {
              "model": [llm_loader.model_name],
@@ -154,6 +162,8 @@
              "word_count": [word_count],
              "inference_time": [total_time],
              "inference_speed": [word_count / total_time],
+             "bleu1": [scores["bleu_scores"]["bleu"]],
+             "rougeL": [scores["rouge_scores"]["rougeL"]],
          }
      )

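One detail worth noting in the test harness: the ground truth now prefers a question's wellFormedAnswers field (as in MS MARCO-style data) and falls back to answers when it is absent. A tiny sketch of that selection logic with invented records:

# Invented example records, only to show the field-selection logic added above.
questions = [
    {"question": "where is okuma", "answers": ["japan", "okuma"]},
    {
        "question": "what does jamaican people speak",
        "answers": ["jamaican english"],
        "wellFormedAnswers": ["Jamaican people speak Jamaican English."],
    },
]

for question in questions:
    ground_truth = question[
        "wellFormedAnswers" if "wellFormedAnswers" in question else "answers"
    ]
    print(question["question"], "->", ground_truth)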
requirements.txt CHANGED
@@ -9,4 +9,6 @@ gradio==4.26.0
  spaces==0.27.1
  black==24.4.0
  chardet==5.2.0
- sentencepiece==0.2.0
+ sentencepiece==0.2.0
+ evaluate==0.4.2
+ rouge_score==0.1.2