|
from ast import arguments |
|
from typing import Literal, List |
|
import json |
|
|
|
import pandas as pd |
|
|
|
import dotenv |
|
dotenv.load_dotenv() |
|
|
|
from funix import funix, import_theme |
|
from vectara_theme import vectara_theme |
|
import_theme(vectara_theme) |
|
|
|
from app_utils import pull_results, scan_and_extract |
|
|
|
|
|
# Load the results snapshot stored in ./results.json.
# A context manager closes the file handle deterministically (the previous
# bare `open(...)` left that to the garbage collector), and the explicit
# UTF-8 encoding avoids depending on the platform's default text encoding.
with open("./results.json", "r", encoding="utf-8") as results_file:
    results = json.load(results_file)

# Tabular view of the snapshot; this is what the leaderboard reads.
results_df = pd.DataFrame(results)
|
|
|
|
|
# Best-effort refresh: try to pull newer results into ./results and rebuild
# the table from them. Any failure is reported and the previously loaded
# `results` / `results_df` stay in effect.
try:
    pull_results("./results")
    _latest_results = scan_and_extract("./results")
    _latest_results_df = pd.DataFrame(_latest_results)
except Exception as e:
    # Deliberately broad: a failed refresh must never prevent the app from
    # starting with the data it already has.
    print(f"Failed to pull and/or extract latest results: {e}")
else:
    # Swap both names together only after every step succeeded, so `results`
    # and `results_df` can never describe different data sets.
    results, results_df = _latest_results, _latest_results_df
|
|
|
@funix(
    title="Hughes Hallucination Evaluation Model (HHEM) Leaderboard",
    direction="column",
    autorun=True,
    theme="vectara",
)
def leaderboard(
    filter_models_by_name: str = ""
) -> pd.DataFrame:
    """# Hughes Hallucination Evaluation Model (HHEM) Leaderboard

    Using [Vectara](https://vectara.com/)'s proprietary [HHEM](https://www.vectara.com/blog/hhem-2-1-a-better-hallucination-detection-model), this leaderboard evaluates how often an LLM hallucinates -- containing information not stated in the source document -- when summarizing a document. For an LLM, its hallucination rate is defined as the ratio of summaries that hallucinate to the total number of summaries it generates. HHEM's open source version is available [here](https://huggingface.co/vectara/hallucination_evaluation_model). For more details or to contribute, see [this Github repo](https://github.com/vectara/hallucination-leaderboard).

    **Work in progress**: For Internal Use Only.

    ## Usage

    * All LLMs are displayed by default. To filter, enter the names of the models that you want to see in the "Filter Models by Name" field below, separated by commas or semicolons.
    * Results are paginated. To page thru, use the `<` or `>` buttons at the bottom right corner of the table.
    * To sort the table, hover over a column header and click the arrow. The arrow automatically points up and down depending on the sort order.
    * Click the "Refresh" button to refresh the leaderboard if the table is not shown or does not update when you change the filter.



    Args:
        filter_models_by_name: filter models by name using comma-separated strings
    """
    # NOTE(review): the docstring above is written as Markdown and reads like
    # user-facing page text (presumably rendered by funix -- confirm), so it is
    # kept verbatim; developer notes live in these comments instead.
    #
    # Returns the rows of the module-level `results_df` whose "LLM" value
    # contains at least one requested name (all rows when the filter is empty),
    # with "Hallucination %" rounded to 3 decimals, sorted ascending by it.

    # Work on a private copy so that rounding never writes into the shared
    # module-level DataFrame that every call reads from.
    df = results_df.copy()
    df["Hallucination %"] = df["Hallucination %"].round(3)

    # "," and ";" are both accepted as separators. Spaces are ignored, and
    # empty entries (e.g. from "a,,b" or a trailing separator) are dropped.
    requested_names = [
        name
        for name in filter_models_by_name.replace(",", ";").replace(" ", "").split(";")
        if name
    ]

    if requested_names:
        # Compare against space-free model names so that a filter such as
        # "GPT 4" (whose spaces were just removed) can still match a model
        # whose name itself contains spaces.
        llm_names = df["LLM"].str.replace(" ", "", regex=False)

        # Match literal substrings instead of building a regex from user
        # input: characters like ".", "+" or "(" in a model name would
        # otherwise be interpreted as regex syntax or raise on bad patterns.
        keep = pd.Series(False, index=df.index)
        for name in requested_names:
            keep |= llm_names.str.contains(name, regex=False, na=False)
        df = df[keep]

    # Lowest hallucination rate first, with or without a filter.
    return df.sort_values(by="Hallucination %", ascending=True)
|
|