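# NOTE: the original first lines ("Spaces:" / "Sleeping" / "Sleeping") were
# page-header residue from a web export, not part of this module.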
def is_summary_valid(summary: str) -> bool:
    """
    Check whether a summary is valid.

    A summary is valid if it is a string containing at least five
    whitespace-separated words.

    Args:
        summary (str): The summary to check. Non-string values (e.g. None)
            are accepted and treated as invalid rather than raising.

    Returns:
        bool: True if the summary is valid, False otherwise.
    """
    # Guard first: anything that is not a string cannot be split into words.
    if not isinstance(summary, str):
        return False
    # str.split() with no arguments collapses runs of whitespace, so empty
    # and whitespace-only strings yield zero words.
    return len(summary.split()) >= 5
def create_pairs(df) -> list:
    """
    Create [source, summary] pairs from a dataframe.

    Args:
        df (DataFrame): The dataframe containing 'source' and 'summary'
            columns. Any other columns are ignored.

    Returns:
        list: A list of pairs [source, summary], one per row, in row order.

    Raises:
        KeyError: If 'source' or 'summary' is not a column of df.
    """
    # Walk the two columns in lockstep instead of using iterrows(): this
    # avoids building a Series per row and keeps each column's own values
    # rather than a row-wise coerced copy.
    return [
        [source, summary]
        for source, summary in zip(df['source'], df['summary'])
    ]
# Legacy version of format_results, kept commented out for reference.
# def format_results(model_name: str, revision: str, precision: str,
#                    factual_consistency_rate: float, hallucination_rate: float,
#                    answer_rate: float, avg_summary_len: float) -> dict:
#     """
#     Formats the evaluation results into a structured dictionary.
#
#     Args:
#         model_name (str): The name of the evaluated model.
#         revision (str): The revision hash of the model.
#         precision (str): The precision with which the evaluation was run.
#         factual_consistency_rate (float): The factual consistency rate.
#         hallucination_rate (float): The hallucination rate.
#         answer_rate (float): The answer rate.
#         avg_summary_len (float): The average summary length.
#
#     Returns:
#         dict: A dictionary containing the structured evaluation results.
#     """
#     results = {
#         "config": {
#             "model_dtype": precision,  # Precision with which you ran the evaluation
#             "model_name": model_name,  # Name of the model
#             "model_sha": revision  # Hash of the model
#         },
#         "results": {
#             "hallucination_rate": {
#                 "hallucination_rate": round(hallucination_rate,3)
#             },
#             "factual_consistency_rate": {
#                 "factual_consistency_rate": round(factual_consistency_rate,1)
#             },
#             "answer_rate": {
#                 "answer_rate": round(answer_rate*100,1)
#             },
#             "average_summary_length": {
#                 "average_summary_length": round(avg_summary_len,1)
#             },
#         }
#     }
#
#     return results
def format_results(model_name: str, revision: str, precision: str, overall_js: float, overall_ci: tuple, **experiment_scores) -> dict:
    """
    Format the evaluation results into a structured dictionary.

    Args:
        model_name (str): The name of the evaluated model.
        revision (str): The revision hash of the model.
        precision (str): The precision with which the evaluation was run.
        overall_js (float): The overall average JS divergence.
        overall_ci (tuple): The confidence interval for the overall JS divergence.
        **experiment_scores: Experiment-specific scores and confidence
            intervals, passed as keyword arguments (E1, E1_ci, E2, E2_ci, ...).
            Each is stored under its keyword name, unmodified.

    Returns:
        dict: A dictionary with two sections: "config" (model_dtype,
        model_name, model_sha) and "results" (overall_js_divergence,
        overall_confidence_interval, followed by every experiment score).
    """
    return {
        "config": {
            "model_dtype": precision,  # Precision with which the evaluation was run
            "model_name": model_name,  # Name of the model
            "model_sha": revision,  # Hash of the model
        },
        "results": {
            "overall_js_divergence": overall_js,
            "overall_confidence_interval": overall_ci,
            # Experiment entries are unpacked last, so they follow the
            # overall metrics and take precedence on a key collision.
            **experiment_scores,
        },
    }