# rag-bench-evaluation/scripts/get_factual_evaluation.py
# Uploaded by gourisankar85 ("Upload 4 files", commit 58f58f3, verified)
import os
import json
import logging
from scripts.get_prediction_result import get_prediction_result
from scripts.helper import ensure_directory_exists, load_dataset
# Set up logging configuration
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Evaluate counterfactual robustness: save the predictions and the fact-checking scores
def _compute_factual_scores(results, noise_rate):
    """Aggregate counterfactual-robustness metrics over prediction records.

    Args:
        results: Sized sequence of dicts. Each record must provide a
            ``'label'`` sequence and a ``'factlabel'`` value.
        noise_rate: Noise rate taken from the run configuration; it is echoed
            into the scores and changes how ``tt`` is counted when equal to 1.

    Returns:
        A dict with the keys ``all_rate``, ``noise_rate``, ``tt``, ``nums``,
        ``fact_check_rate``, ``correct_rate``, ``fact_tt`` and ``correct_tt``.
        Every rate is 0 when its denominator is 0, so an empty ``results``
        yields all-zero rates instead of raising ``ZeroDivisionError``.
    """
    total = len(results)

    # A record counts towards `tt` when either:
    #   * noise_rate is exactly 1 and its first label is -1, or
    #   * its labels contain a 1 and no 0.
    tt = sum(
        1
        for record in results
        if (noise_rate == 1 and record['label'][0] == -1)
        or (0 not in record['label'] and 1 in record['label'])
    )

    fact_tt = 0      # records whose 'factlabel' is 1
    correct_tt = 0   # of those, records whose labels contain no 0
    for record in results:
        if record['factlabel'] == 1:
            fact_tt += 1
            if 0 not in record['label']:
                correct_tt += 1

    # Key order is kept stable because it determines the layout of the
    # JSON score file.
    return {
        'all_rate': tt / total if total else 0,
        'noise_rate': noise_rate,
        'tt': tt,
        'nums': total,
        'fact_check_rate': fact_tt / total if total else 0,
        'correct_rate': correct_tt / fact_tt if fact_tt > 0 else 0,
        'fact_tt': fact_tt,
        'correct_tt': correct_tt,
    }


def get_factual_evaluation(config):
    """Run the counterfactual-robustness evaluation and persist its outputs.

    Obtains predictions through ``get_prediction_result``, writes them one
    JSON object per line to ``prediction_<ext>.json``, computes the
    fact-checking scores and writes them to ``scores_<ext>.json``. Both files
    live under ``config['result_path'] + 'Counterfactual Robustness/'``.

    Args:
        config: Mapping that must contain ``'result_path'``, ``'noise_rate'``,
            ``'output_file_extension'`` and ``'factual_file_name'``. It is
            also forwarded unchanged to ``get_prediction_result``.

    Returns:
        The path of the prediction file that was written.
    """
    # Plain concatenation is kept on purpose so the resulting directory is
    # exactly what existing configurations already produce.
    result_path = config['result_path'] + 'Counterfactual Robustness/'
    noise_rate = config['noise_rate']

    filename = os.path.join(result_path, f"prediction_{config['output_file_extension']}.json")
    # NOTE(review): presumably creates the parent directory of `filename`;
    # confirm against scripts.helper.
    ensure_directory_exists(filename)

    results = get_prediction_result(config, config['factual_file_name'], filename)

    # Persist the raw predictions, one JSON document per line.
    with open(filename, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(result, ensure_ascii=False) + '\n' for result in results)

    scores = _compute_factual_scores(results, noise_rate)
    if scores['nums'] == 0:
        logging.warning("No prediction results for %s; all rates reported as 0", filename)

    score_filename = os.path.join(result_path, f"scores_{config['output_file_extension']}.json")
    # Explicit UTF-8 matches the prediction file and keeps ensure_ascii=False
    # safe regardless of the platform's default encoding.
    with open(score_filename, 'w', encoding='utf-8') as f:
        json.dump(scores, f, ensure_ascii=False, indent=4)

    return filename