# de-identification-leaderboard / utils / Evaluation_answer_txt.py
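# Span-level evaluation of de-identification answer files: compares a gold
# answer.txt against a predicted answer.txt and reports precision / recall / F1
# per label type, plus micro- and macro-averages.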
import re
import os
from collections import Counter
import json


class Tag:
    """A single annotation span parsed from one tab-separated answer.txt line."""

    def __init__(self, txt_line: str):
        # Expected format: file_name \t label_type \t label_start \t label_end \t label_text
        # match = re.match(r'(.+)\t(\w+)\t(\d+)\t(\d+)\t(.+)', txt_line)
        try:
            sep = txt_line.strip().split('\t')
            self.file_id = sep[0]
            self.type = sep[1]
            self.start = sep[2]  # kept as str; int(sep[2]) also works when offsets are numeric
            self.end = sep[3]    # kept as str; int(sep[3])
            self.text = sep[4]
        except IndexError:
            raise ValueError('The input line is not in the expected 5-column tab-separated format.')

    def get_type(self):
        return self.type

    def get_file_id(self):
        return self.file_id

    def __eq__(self, other: 'Tag'):
        # Two tags are equal when file_id, type, start and end all match;
        # the surface text is deliberately not compared.
        return (self.file_id == other.file_id
                and self.type == other.type
                and self.start == other.start
                and self.end == other.end)

    def __repr__(self):
        return f'<{self.__class__.__name__} {self.file_id:10} {self.type:10} s:{self.start:5} e:{self.end:5} {self.text}>\n'

    def __hash__(self):
        return hash((self.file_id, self.type, self.start, self.end))
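
# Illustrative example (not taken from the dataset): a line such as
#     "file_001\tPATIENT\t15\t23\tJohn Doe"
# yields a Tag with file_id='file_001', type='PATIENT', start='15', end='23', text='John Doe'.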


class Evaluation_answer_txt:
    def __init__(self, gold_answer, pred_answer):
        self.gold_answer = gold_answer
        self.pred_answer = pred_answer
        self.gold_set = set()                 # set of Tag parsed from the gold answer file
        self.pred_set = set()                 # set of Tag parsed from the predicted answer file
        self.type_set = set()                 # set of label-type strings seen in either file
        self.gold_label_counter = Counter()   # gold label-type counts (used as "support")
        self.resault_score = {}

    def _lines_to_tag_set(self, lines, set_type):  # set_type: 'gold' or 'pred'
        tags = []
        for i in range(len(lines)):
            try:
                tag = Tag(lines[i])
                tags.append(tag)
            except ValueError:
                print(f'Error at {set_type} answer line: {i+1}, {lines[i]}')
        return set(tags)

    def _set_filter(self, tag_set, type):
        # keep only the tags whose label type matches `type`
        return {tag for tag in tag_set if tag.get_type() == type}

    def _division(self, a, b):
        try:
            return a / b
        except ZeroDivisionError:
            return 0.0

    def _f1_score(self, TP=None, FP=None, FN=None):
        if TP is None or FP is None or FN is None:
            raise ValueError('TP, FP, FN should be given.')
        precision = self._division(TP, TP + FP)
        recall = self._division(TP, TP + FN)
        f1 = self._division(2 * precision * recall, precision + recall)
        return {'precision': precision, 'recall': recall, 'f1': f1}
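    # Reference formulas (the standard definitions used above):
    #   precision = TP / (TP + FP)
    #   recall    = TP / (TP + FN)
    #   f1        = 2 * precision * recall / (precision + recall)
    # _division() returns 0.0 whenever the denominator is zero.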
    def eval(self, ignore_no_gold_tag_file=True):
        with open(self.gold_answer, 'r') as f:
            gold_line = f.readlines()
        # with open(self.pred_answer, 'r') as f:
        #     pred_line = f.readlines()
        ########## support both a file path (str) and an uploaded file object ##########
        if isinstance(self.pred_answer, str):
            with open(self.pred_answer, 'r') as f:
                pred_line = f.readlines()
        else:
            pred_line = self.pred_answer.readlines()
            # a file object opened in binary mode yields bytes, so decode each line
            pred_line = [line.decode('utf-8') for line in pred_line]

        self.gold_set = self._lines_to_tag_set(gold_line, 'gold')
        self.pred_set = self._lines_to_tag_set(pred_line, 'pred')

        # The ISLab AI CUP scoring program ignores files that have no gold tags:
        # only the files that appear in the gold answer.txt are considered.
        if ignore_no_gold_tag_file:
            # drop predicted tags for files that have no gold tags
            gold_files = {tag.get_file_id() for tag in self.gold_set}
            self.pred_set = {tag for tag in self.pred_set if tag.get_file_id() in gold_files}

        # collect label types and gold counts
        for tag in self.gold_set:
            self.type_set.add(tag.get_type())
            self.gold_label_counter[tag.get_type()] += 1
        for tag in self.pred_set:
            self.type_set.add(tag.get_type())

        TP_set = self.gold_set & self.pred_set
        FP_set = self.pred_set - self.gold_set
        FN_set = self.gold_set - self.pred_set

        # per-label scores
        for label in self.type_set:
            filter_TP = self._set_filter(TP_set, label)
            filter_FP = self._set_filter(FP_set, label)
            filter_FN = self._set_filter(FN_set, label)
            score = self._f1_score(len(filter_TP), len(filter_FP), len(filter_FN))
            self.resault_score[label] = score

        # MICRO_AVERAGE: pooled TP/FP/FN counts over all labels
        self.resault_score['MICRO_AVERAGE'] = self._f1_score(len(TP_set), len(FP_set), len(FN_set))

        # MACRO_AVERAGE: unweighted mean of per-label precision and recall;
        # F1 is then recomputed from those means (AI CUP convention), not averaged directly.
        precision_sum = 0
        recall_sum = 0
        # f1_sum = 0  # at AI CUP, F1 is calculated from the macro-averaged precision and recall
        for label in self.type_set:
            precision_sum += self.resault_score[label]['precision']
            recall_sum += self.resault_score[label]['recall']
            # f1_sum += self.resault_score[label]['f1']
        precision = self._division(precision_sum, len(self.type_set))
        recall = self._division(recall_sum, len(self.type_set))
        # f1 = 2 * precision * recall / (precision + recall)
        f1 = self._division(2 * precision * recall, precision + recall)
        self.resault_score['MACRO_AVERAGE'] = {'precision': precision, 'recall': recall, 'f1': f1}

        # add support (gold tag count) to each label and to the averages
        for label in self.type_set:
            self.resault_score[label]['support'] = self.gold_label_counter[label]
        self.resault_score['MICRO_AVERAGE']['support'] = len(self.gold_set)
        self.resault_score['MACRO_AVERAGE']['support'] = len(self.gold_set)

        # return json.dumps(self.resault_score, indent=4)
        return self.resault_score
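
# Evaluation_answer_txt.eval() returns a dict that maps each label type (plus
# 'MICRO_AVERAGE' and 'MACRO_AVERAGE') to
# {'precision': float, 'recall': float, 'f1': float, 'support': int}.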
if __name__=="__main__":
# with open('.output/[meta-llama@Llama-2-7b-hf][Setting3][icl]answer.txt', 'r', encoding='utf-8') as f:
# lines = [line.strip() for line in f.readlines() if line.strip() != '']
# gold_path = 'dataset/Setting3_test_answer.txt'
# pred_path = '.output/EleutherAI-pythia-1b-Setting3_answer.txt'
# gold_path = './.output/test_eval/gold_answer.txt'
# pred_path = './.output/test_eval/pred_answer.txt'
gold_path = 'dataset/Setting3_test_answer.txt'
pred_path = '.output/[meta-llama@Llama-2-7b-hf][Setting3][icl]answer.txt'
eval = Evaluation_answer_txt(gold_path, pred_path)
res = eval.eval()
print(res)
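    # Optional: pretty-print the scores with the json module imported above, e.g.
    # print(json.dumps(res, indent=4))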