ans_f1 = 72.25775338812923 correct = 439 eval_loss = -11.41357421875 incorrect = 54 similar = 271