Spaces:

vectara
/

leaderboard

Running on CPU Upgrade

File size: 6,862 Bytes

# %%
import os 
import json
from huggingface_hub import Repository
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.figure
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler

# import dotenv
# dotenv.load_dotenv()

# Module-level scaler shared across calls; visualize_leaderboard() re-fits it
# on every call (fit_transform) to rescale the plotted hallucination rates
# before they are passed to the colormap.
min_max_scaler = MinMaxScaler()

# %%
def pull_results(results_dir: str):
    """Sync the local copy of the ``vectara/results`` dataset repository.

    Builds a ``huggingface_hub.Repository`` rooted at *results_dir* with
    ``clone_from="vectara/results"`` and ``repo_type="dataset"``, then runs
    ``git_pull()`` on it. Any exception raised by the hub client propagates
    to the caller.
    """
    # NOTE(review): `Repository` is reported as deprecated in newer
    # huggingface_hub releases -- confirm against the pinned version.
    results_repo = Repository(
        local_dir=results_dir,
        clone_from="vectara/results",
        repo_type="dataset",
    )
    results_repo.git_pull()

def extract_info_from_result_file(result_file):
    """Load one result JSON file and flatten it into a leaderboard row.

    The file is expected to have this layout::

        {
          "config": {
            "model_dtype": "float16",
            "model_name": "databricks/dbrx-instruct",
            "model_sha": "main"
          },
          "results": {
            "hallucination_rate": {"hallucination_rate": 8.34990059642147},
            "factual_consistency_rate": {"factual_consistency_rate": 91.65009940357854},
            "answer_rate": {"answer_rate": 100.0},
            "average_summary_length": {"average_summary_length": 85.9}
          }
        }

    Args:
        result_file: Path of the JSON file to read.

    Returns:
        A dict with the keys ``"LLM"``, ``"Hallucination %"``, ``"Answer %"``
        and ``"Avg Summary Words"``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If one of the expected keys is missing.
    """
    # Use a context manager so the handle is closed even if parsing fails
    # (the previous `json.load(open(...))` left it to the garbage collector).
    with open(result_file, "r", encoding="utf-8") as f:
        info = json.load(f)

    metrics = info["results"]
    # "factual_consistency_rate" is present in the file but is not exported.
    return {
        "LLM": info["config"]["model_name"],
        "Hallucination %": metrics["hallucination_rate"]["hallucination_rate"],
        "Answer %": metrics["answer_rate"]["answer_rate"],
        "Avg Summary Words": metrics["average_summary_length"]["average_summary_length"],
    }

def get_latest_result_file(dir: str):
    """Return the path of the most recent ``.json`` file directly inside *dir*.

    "Most recent" means the largest modification time. When several files
    share the same modification time, the one whose path sorts last wins.

    Args:
        dir: Directory to inspect (not searched recursively).

    Returns:
        ``os.path.join(dir, <file name>)`` of the newest JSON file, or ``None``
        when *dir* is not a directory or contains no ``.json`` files.
    """
    if not os.path.isdir(dir):
        return None

    candidates = [
        os.path.join(dir, name)
        for name in os.listdir(dir)
        if name.endswith(".json")
    ]
    if not candidates:
        return None

    # The previous implementation sorted ascending by mtime and returned the
    # first element, i.e. the *oldest* file. Pick the maximum instead.
    return max(candidates, key=lambda path: (os.path.getmtime(path), path))

def scan_and_extract(dir: str):
    """Walk *dir* and collect one leaderboard row per sub-directory.

    Every sub-directory found by ``os.walk`` (at any depth) is passed to
    `get_latest_result_file`; when that returns a path, the file is parsed
    with `extract_info_from_result_file` and the row is appended to the
    output. JSON files sitting directly in *dir* itself are not considered.

    Returns:
        A list of row dicts, in walk order (empty when nothing is found).
    """
    extracted = []
    for parent, subdirs, _files in os.walk(dir):
        for subdir in subdirs:
            latest = get_latest_result_file(os.path.join(parent, subdir))
            if latest is None:
                continue
            extracted.append(extract_info_from_result_file(latest))
    return extracted

def load_results(
        results_dir: str = "./results", 
        results_json: str = "./results.json"
        ):
    """Refresh the results cache and return it as a sorted DataFrame.

    Steps:
      1. Try to pull the results repository into *results_dir*. Failures are
         printed and otherwise ignored.
      2. Try to scan *results_dir* and, if any rows are found, overwrite
         *results_json* with them. Failures are printed and the existing
         *results_json* is used instead.
      3. Load *results_json*, clean it and sort it.

    Args:
        results_dir: Local directory holding the results repository.
        results_json: Path of the JSON snapshot to write and read.

    Returns:
        A ``pandas.DataFrame`` sorted ascending by ``"Hallucination %"``, with
        the numeric columns rounded to 3 decimals and an extra
        ``"LLM_lower_case"`` column.

    Raises:
        OSError / json.JSONDecodeError: If *results_json* cannot be read.
    """
    numeric_columns = ["Hallucination %", "Answer %", "Avg Summary Words"]

    try:
        pull_results(results_dir)
        print (f"Successfully pulled results from {results_dir}")
    except Exception as e:
        # Best effort: fall through and use whatever is already on disk.
        print(f"Failed to pull and/or extract latest results: {e}")

    try:
        results = scan_and_extract(results_dir)
        if len(results) > 0:
            with open(results_json, "w") as f:
                json.dump(results, f, indent=2)
            print(f"Successfully scanned and extracted results from {results_dir} and saved to {results_json}")
        else:
            print(f"No results found in {results_dir}")
    except Exception as e:
        print(f"Failed to scan and extract results from {results_dir}: {e}")
        print(f"Using pre-dumped results from {results_json}")

    # Close the snapshot file deterministically instead of leaking the handle.
    with open(results_json, "r") as f:
        results = json.load(f)

    results_df = pd.DataFrame(results)

    # Replace the "TBD" placeholder with 100 *before* sorting: sorting a column
    # that mixes strings and floats raises TypeError.
    results_df = results_df.replace("TBD", 100)
    for column in numeric_columns:
        results_df[column] = pd.to_numeric(results_df[column])

    # Sort on the unrounded values, then round for display.
    results_df = results_df.sort_values(by="Hallucination %", ascending=True)
    for column in numeric_columns:
        results_df[column] = results_df[column].round(3)

    results_df["LLM_lower_case"] = results_df["LLM"].str.lower()

    return results_df

# %%
def determine_font_size(LLM: str, hallucination_percent: float) -> float:
    """Pick the label font size for one bar.

    A slightly smaller size (8.5) is used only when the hallucination value is
    below 0.25 *and* the model name is longer than 10 characters; every other
    combination gets 9.

    Args:
        LLM: Model name shown as the label.
        hallucination_percent: Hallucination value associated with the model.

    Returns:
        The font size in points. It may be fractional, hence ``float`` rather
        than the previous (incorrect) ``int`` annotation.
    """
    short_bar = hallucination_percent < 0.25
    long_name = len(LLM) > 10
    return 8.5 if short_bar and long_name else 9
    
def determine_font_color(hallucination_percent: float) -> str:
    """Return ``'black'`` for values strictly between 0.25 and 0.65, else ``'white'``."""
    in_middle_band = 0.25 < hallucination_percent < 0.65
    return 'black' if in_middle_band else 'white'

def determine_llm_x_position_and_font_color(LLM: str, hallucination_percent: float) -> tuple:
    """Decide where to draw a model label and in which colour.

    The bar length is estimated as ``5 * hallucination_percent`` and compared
    with the number of characters in the model name:

    * name shorter than the estimate: the label goes at x = 0.01 and the
      colour comes from `determine_font_color`;
    * otherwise: the label goes at x = ``hallucination_percent`` and is black.

    Args:
        LLM: Model name used as the label.
        hallucination_percent: Hallucination value associated with the model.

    Returns:
        An ``(x_position, font_color)`` tuple. The previous ``-> float``
        annotation did not match what the function returns.
    """
    name_length = len(LLM)
    print ("LLM: ", LLM, "hallu_rate: ", hallucination_percent, "name_length: ", name_length)

    # Ratio used to compare a hallucination value against a character count.
    hallu_rate_to_bar_length_ratio = 5
    bar_length = hallu_rate_to_bar_length_ratio * hallucination_percent

    if name_length < bar_length:
        return 0.01, determine_font_color(hallucination_percent)
    # to the right of the bar, black anyway
    return hallucination_percent, 'black'

def visualize_leaderboard(df: pd.DataFrame) -> matplotlib.figure.Figure:
    """Draw a horizontal bar chart for the first ten rows of *df*.

    Each bar is one model (``"LLM"`` column); its length is the model's
    ``"Hallucination %"`` value, which is also printed just to the right of
    the bar. Bar colours come from the ``jet`` colormap applied to the
    min-max-scaled hallucination values of the plotted rows.

    Args:
        df: Frame with at least the ``"LLM"`` and ``"Hallucination %"`` columns.
            It is not modified.

    Returns:
        The newly created matplotlib figure.
    """
    fig = plt.figure(figsize=(8, 4))

    # Work on an explicit copy: adding a column to the result of `head()`
    # is chained assignment on a slice of the caller's frame.
    plot_df = df.head(10).copy()
    plot_df["normalized_hallucination_rate"] = min_max_scaler.fit_transform(plot_df[["Hallucination %"]])

    # Horizontal bars: model names on the y-axis, hallucination values on x.
    plt.barh(plot_df["LLM"], plot_df["Hallucination %"], color=plt.cm.jet(plot_df["normalized_hallucination_rate"]))

    # Annotate every bar with its value, slightly right of the bar end.
    for _, row in plot_df.iterrows():
        plt.text(
            row["Hallucination %"] + 0.025,
            row["LLM"],
            row["Hallucination %"],
            ha='left',
            va='center',
            fontsize=9,
        )
    plt.tight_layout()

    # add margin to the right of the plot
    plt.subplots_adjust(right=0.95)

    plt.xticks(fontsize=9)
    plt.xlabel(f"Copyright (2025) Vectara, Inc. Plot generated on: {datetime.now().strftime('%B %d, %Y')}", fontsize=9)
    plt.title("Grounded Hallucination Rate of Best LLMs", fontsize=12)

    ax = plt.gca()
    for side in ('top', 'right', 'left'):
        ax.spines[side].set_visible(False)
    ax.invert_yaxis()  # Invert the y-axis to display bars top-down

    return fig

# %%

if __name__ == "__main__":
    # Script entry point: rebuild ./results.json from the rows found under
    # ./results (no repository pull is performed here).
    extracted_rows = scan_and_extract("./results")
    with open("./results.json", "w") as snapshot:
        json.dump(extracted_rows, snapshot, indent=2)

# %%