Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 6,862 Bytes
8c3427d 696341e 8c3427d 81cb431 8c3427d 81cb431 8c3427d 120684a 8c3427d 696341e 8c3427d 120684a 81cb431 120684a 8c3427d 696341e 8c3427d 696341e 8c3427d 696341e 8c3427d 696341e 8c3427d 696341e 8c3427d 696341e 8c3427d 696341e 8c3427d 696341e 8c3427d 696341e 8c3427d 696341e 8c3427d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 |
# %%
import os
import json
from huggingface_hub import Repository
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.figure
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
# import dotenv
# dotenv.load_dotenv()
# Module-level scaler instance; visualize_leaderboard() calls fit_transform on it
# to map the plotted "Hallucination %" values onto [0, 1] for the bar colormap.
min_max_scaler = MinMaxScaler()
# %%
def pull_results(results_dir: str):
    """Sync the local copy of the ``vectara/results`` dataset repository.

    A ``huggingface_hub.Repository`` is created for ``results_dir`` (with
    ``clone_from="vectara/results"`` and ``repo_type="dataset"``) and a
    ``git pull`` is then issued on it.

    Args:
        results_dir: Local directory that holds (or will hold) the repository.
    """
    results_repo = Repository(
        local_dir=results_dir,
        clone_from="vectara/results",
        repo_type="dataset",
    )
    results_repo.git_pull()
def extract_info_from_result_file(result_file):
    """Read one result JSON file and flatten it into a leaderboard row.

    The file is expected to have this shape::

        {
            "config": {
                "model_dtype": "float16",
                "model_name": "databricks/dbrx-instruct",
                "model_sha": "main"
            },
            "results": {
                "hallucination_rate": {
                    "hallucination_rate": 8.34990059642147
                },
                "factual_consistency_rate": {
                    "factual_consistency_rate": 91.65009940357854
                },
                "answer_rate": {
                    "answer_rate": 100.0
                },
                "average_summary_length": {
                    "average_summary_length": 85.9
                }
            }
        }

    Args:
        result_file: Path of the JSON result file to read.

    Returns:
        A dict with the keys ``"LLM"``, ``"Hallucination %"``, ``"Answer %"``
        and ``"Avg Summary Words"``. The ``factual_consistency_rate`` entry of
        the file is not included in the row.

    Raises:
        KeyError: If one of the expected sections or metrics is missing.
    """
    # Use a context manager so the file handle is closed even when the JSON
    # is malformed and json.load raises.
    with open(result_file, "r") as f:
        info = json.load(f)

    return {
        "LLM": info["config"]["model_name"],
        "Hallucination %": info["results"]["hallucination_rate"]["hallucination_rate"],
        "Answer %": info["results"]["answer_rate"]["answer_rate"],
        "Avg Summary Words": info["results"]["average_summary_length"]["average_summary_length"],
    }
def get_latest_result_file(dir: str):
    """Return the path of the most recently modified ``.json`` file in ``dir``.

    "Latest" is decided by the file's modification time on disk
    (``os.path.getmtime``); when several files share the same modification
    time, the lexicographically greatest file name wins so the result does
    not depend on the arbitrary order of ``os.listdir``.

    Args:
        dir: Directory to inspect (not searched recursively).

    Returns:
        ``os.path.join(dir, <file name>)`` for the newest ``.json`` file, or
        ``None`` when ``dir`` is not a directory or contains no ``.json`` file.
    """
    if not os.path.isdir(dir):
        return None

    candidates = [name for name in os.listdir(dir) if name.endswith(".json")]
    if not candidates:
        return None

    # max() picks the newest file; an ascending sort followed by [0] would
    # return the oldest one instead.
    latest = max(
        candidates,
        key=lambda name: (os.path.getmtime(os.path.join(dir, name)), name),
    )
    return os.path.join(dir, latest)
def scan_and_extract(dir: str):
    """Collect one leaderboard row per sub-directory found under ``dir``.

    Every directory reached by ``os.walk(dir)`` has each of its immediate
    sub-directories passed to ``get_latest_result_file``; whenever that
    returns a path, the file is parsed with ``extract_info_from_result_file``.
    JSON files that sit directly in ``dir`` itself are not considered.

    Returns:
        A list of the extracted rows, in walk order (possibly empty).
    """
    # Lazy generator: each candidate is looked up right before it is parsed.
    selected_files = (
        get_latest_result_file(os.path.join(parent, child))
        for parent, children, _ in os.walk(dir)
        for child in children
    )
    return [
        extract_info_from_result_file(path)
        for path in selected_files
        if path is not None
    ]
def load_results(
    results_dir: str = "./results",
    results_json: str = "./results.json"
):
    """Build the leaderboard DataFrame from the results repository.

    Steps:
      1. Try to pull the latest results into ``results_dir`` (best effort;
         a failure is printed and otherwise ignored).
      2. Scan ``results_dir`` and, when rows are found, dump them to
         ``results_json``.
      3. If scanning failed or produced no rows, load the previously dumped
         rows from ``results_json`` instead.
      4. Replace ``"TBD"`` placeholders with 100, sort by ``"Hallucination %"``
         ascending, round the metric columns to 3 decimals and add a
         ``"LLM_lower_case"`` column.

    Args:
        results_dir: Local directory holding the per-model result files.
        results_json: Path of the JSON dump used as cache / fallback.

    Returns:
        A ``pandas.DataFrame`` with the columns ``"LLM"``,
        ``"Hallucination %"``, ``"Answer %"``, ``"Avg Summary Words"`` and
        ``"LLM_lower_case"``.

    Raises:
        OSError: If the fallback is needed but ``results_json`` cannot be read.
    """
    try:
        pull_results(results_dir)
        print (f"Successfully pulled results from {results_dir}")
    except Exception as e:
        print(f"Failed to pull and/or extract latest results: {e}")

    results = []
    try:
        results = scan_and_extract(results_dir)
        if results:
            with open(results_json, "w") as f:
                json.dump(results, f, indent=2)
            print(f"Successfully scanned and extracted results from {results_dir} and saved to {results_json}")
        else:
            print(f"No results found in {results_dir}")
    except Exception as e:
        print(f"Failed to scan and extract results from {results_dir}: {e}")

    if not results:
        # Nothing usable came out of the scan: fall back to the last dump
        # rather than failing later on an empty DataFrame.
        print(f"Using pre-dumped results from {results_json}")
        with open(results_json, "r") as f:
            results = json.load(f)

    results_df = pd.DataFrame(results)
    # Replace "TBD" placeholders with 100 *before* sorting: a column that
    # mixes strings and numbers cannot be ordered.
    results_df = results_df.replace("TBD", 100)
    results_df = results_df.sort_values(by="Hallucination %", ascending=True)

    for column in ["Hallucination %", "Answer %", "Avg Summary Words"]:
        results_df[column] = results_df[column].apply(lambda x: round(x, 3))

    results_df["LLM_lower_case"] = results_df["LLM"].str.lower()

    return results_df
# %%
def determine_font_size(LLM: str, hallucination_percent: float) -> float:
    """Choose the label font size from the LLM name and its hallucination value.

    A slightly smaller font (8.5) is used only when the hallucination value
    is below 0.25 *and* the name is longer than 10 characters; every other
    combination gets 9.

    Args:
        LLM: Model name that will be rendered as the label.
        hallucination_percent: Hallucination value compared against 0.25.

    Returns:
        The font size in points (8.5 or 9), hence the ``float`` annotation.
    """
    is_short_bar = hallucination_percent < 0.25
    is_long_name = len(LLM) > 10
    return 8.5 if is_short_bar and is_long_name else 9
def determine_font_color(hallucination_percent: float) -> str:
    """Return ``'black'`` for values strictly between 0.25 and 0.65, else ``'white'``."""
    in_mid_range = 0.25 < hallucination_percent < 0.65
    return 'black' if in_mid_range else 'white'
def determine_llm_x_position_and_font_color(LLM: str, hallucination_percent: float) -> tuple:
    """Decide where to draw an LLM label relative to its bar, and in which color.

    The bar length is estimated as ``5 * hallucination_percent`` and compared
    with the number of characters in the name. If the name is shorter than
    that estimate the label goes inside the bar (x = 0.01) with the color
    chosen by ``determine_font_color``; otherwise it is placed at the end of
    the bar (x = ``hallucination_percent``) in black.

    Args:
        LLM: Model name to be rendered.
        hallucination_percent: Hallucination value of that model.

    Returns:
        A ``(x_position, font_color)`` tuple: a float and a color name.
    """
    name_length = len(LLM)
    print ("LLM: ", LLM, "hallu_rate: ", hallucination_percent, "name_length: ", name_length)

    hallu_rate_to_bar_length_ratio = 5
    bar_length = hallu_rate_to_bar_length_ratio * hallucination_percent

    if name_length < bar_length:
        # Label fits inside the bar.
        return 0.01, determine_font_color(hallucination_percent)
    # Label sits to the right of the bar, so it is always drawn in black.
    return hallucination_percent, 'black'
def visualize_leaderboard(df: pd.DataFrame) -> matplotlib.figure.Figure:
    """Draw a horizontal bar chart for the first 10 rows of ``df``.

    Each bar shows one row's ``"Hallucination %"``, labelled on the y-axis by
    its ``"LLM"`` name and colored with the ``jet`` colormap after min-max
    scaling the plotted values. The numeric value is written just to the
    right of each bar. ``df`` itself is not modified.

    Args:
        df: DataFrame with at least the columns ``"LLM"`` and
            ``"Hallucination %"``; the rows are plotted in the given order.

    Returns:
        The matplotlib figure that was created for the chart.
    """
    fig = plt.figure(figsize=(8, 4))

    # Work on an explicit copy: adding the helper column to the result of
    # head() would otherwise write into a slice of the caller's DataFrame
    # (SettingWithCopyWarning on pandas versions without copy-on-write).
    plot_df = df.head(10).copy()
    plot_df["normalized_hallucination_rate"] = min_max_scaler.fit_transform(plot_df[["Hallucination %"]])

    plt.barh(plot_df["LLM"], plot_df["Hallucination %"], color=plt.cm.jet(plot_df["normalized_hallucination_rate"]))

    # Print each value slightly to the right of the end of its bar.
    for _, row in plot_df.iterrows():
        plt.text(
            row["Hallucination %"] + 0.025,
            row["LLM"],
            row["Hallucination %"],
            ha='left',
            va='center',
            fontsize=9,
        )

    plt.tight_layout()
    # add margin to the right of the plot
    plt.subplots_adjust(right=0.95)

    plt.xticks(fontsize=9)
    plt.xlabel(f"Copyright (2025) Vectara, Inc. Plot generated on: {datetime.now().strftime('%B %d, %Y')}", fontsize=9)
    plt.title("Grounded Hallucination Rate of Best LLMs", fontsize=12)

    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)
    plt.gca().spines['left'].set_visible(False)
    plt.gca().invert_yaxis()  # Invert the y-axis to display bars top-down

    return fig
# %%
if __name__ == "__main__":
    # Script entry point: rescan ./results and refresh the ./results.json dump.
    results = scan_and_extract("./results")
    with open("./results.json", "w") as dump_file:
        json.dump(results, dump_file, indent=2)
# %%
|