llm-qa-bench

Sleeping

File size: 5,135 Bytes

import json
import os
import sys
import pandas as pd
from timeit import default_timer as timer
import nltk

chatting = len(sys.argv) > 1 and sys.argv[1] == "chat"

if chatting:
    os.environ["BATCH_SIZE"] = "1"

from app_modules.init import app_init
from app_modules.llm_qa_chain import QAChain
from app_modules.utils import print_llm_response, calc_metrics, detect_repetition_scores

llm_loader, qa_chain = app_init()

if chatting:
    print("Starting chat mode")
    while True:
        question = input("Please enter your question: ")
        if question.lower() == "exit":
            break
        result = qa_chain.call_chain({"question": question, "chat_history": []}, None)
        print_llm_response(result)

    sys.exit(0)

num_of_questions = 0

if len(sys.argv) > 1:
    num_of_questions = int(sys.argv[1])

# Create an empty DataFrame with column names
df = pd.DataFrame(
    columns=[
        "id",
        "question",
        "answer",
    ]
)

batch_size = int(os.getenv("BATCH_SIZE", "1"))
print(f"Batch size: {batch_size}")

questions_file_path = os.environ.get("QUESTIONS_FILE_PATH")
debug_retrieval = os.getenv("DEBUG_RETRIEVAL", "false").lower() == "true"

# Open the file for reading
print(f"Reading questions from file: {questions_file_path}")
test_data = json.loads(open(questions_file_path).read())

if isinstance(test_data, dict):
    questions = [test_data[key] for key in test_data.keys()]
    ids = [key for key in test_data.keys()]
else:
    questions = test_data
    ids = [row["id"] for row in questions]

if num_of_questions > 0:
    questions = questions[:num_of_questions]

print(f"Number of questions: {len(questions)}")

if __name__ == "__main__":
    chat_start = timer()
    index = 0

    while index < len(questions):
        batch_ids = ids[index : index + batch_size]
        batch_questions = [q["question"] for q in questions[index : index + batch_size]]

        if isinstance(qa_chain, QAChain):
            inputs = [{"question": q, "chat_history": []} for q in batch_questions]
        else:
            inputs = [{"question": q} for q in batch_questions]

        start = timer()
        result = qa_chain.call_chain(inputs, None)
        end = timer()
        print(f"Completed in {end - start:.3f}s")

        # print("result:", result)
        batch_answers = [r["answer"] for r in result]

        for id, question, answer in zip(batch_ids, batch_questions, batch_answers):
            df.loc[len(df)] = {
                "id": id,
                "question": question,
                "answer": answer,
            }

        index += batch_size

        for r in result:
            print_llm_response(r, debug_retrieval)

    chat_end = timer()
    total_time = chat_end - chat_start
    print(f"Total time used: {total_time:.3f} s")

    df2 = pd.DataFrame(
        columns=[
            "id",
            "question",
            "answer",
            "word_count",
            "ground_truth",
        ]
    )

    for i in range(len(df)):
        question = questions[i]
        answer = df["answer"][i]
        query = df["question"][i]
        id = df["id"][i]

        ground_truth = question[
            "wellFormedAnswers" if "wellFormedAnswers" in question else "answers"
        ]

        word_count = len(nltk.word_tokenize(answer))

        df2.loc[len(df2)] = {
            "id": id,
            "question": query,
            "answer": answer,
            "word_count": word_count,
            "ground_truth": ground_truth,
        }

    df2[["newline_score", "repetition_score", "total_repetitions"]] = df2[
        "answer"
    ].apply(detect_repetition_scores)

    pd.options.display.float_format = "{:.3f}".format
    print(df2.describe())

    word_count = df2["word_count"].sum()

    csv_file = (
        os.getenv("TEST_RESULTS_CSV_FILE") or f"qa_batch_{batch_size}_test_results.csv"
    )
    with open(csv_file, "w") as f:
        f.write(
            f"# RAG: {isinstance(qa_chain, QAChain)} questions: {questions_file_path}\n"
        )
        f.write(
            f"# model: {llm_loader.model_name} repetition_penalty: {llm_loader.repetition_penalty}\n"
        )

    df2.to_csv(csv_file, mode="a", index=False, header=True)
    print(f"test results saved to file: {csv_file}")

    scores = calc_metrics(df2)

    df = pd.DataFrame(
        {
            "model": [llm_loader.model_name],
            "repetition_penalty": [llm_loader.repetition_penalty],
            "word_count": [word_count],
            "inference_time": [total_time],
            "inference_speed": [word_count / total_time],
            "bleu1": [scores["bleu_scores"]["bleu"]],
            "rougeL": [scores["rouge_scores"]["rougeL"]],
        }
    )

    print(f"Number of words generated: {word_count}")
    print(f"Average generation speed: {word_count / total_time:.3f} words/s")

    csv_file = os.getenv("ALL_RESULTS_CSV_FILE") or "qa_chain_all_results.csv"
    file_existed = os.path.exists(csv_file) and os.path.getsize(csv_file) > 0
    df.to_csv(csv_file, mode="a", index=False, header=not file_existed)
    print(f"all results appended to file: {csv_file}")