ans_f1 = 64.63657443342784 correct = 458 eval_loss = -10.802327473958334 incorrect = 59 similar = 247