# Fixture table: each entry pairs segmenter output ("predictions") with the
# gold segmentation ("references") and the scores expected for that pair
# ("result"). Utterances are space-separated symbols in which the literal
# token WORD_BOUNDARY closes each word.
test_cases = [
    # Case 1: predictions are identical to the references, so every
    # precision / recall / F-score entry is expected to be 1.0.
    {
        "predictions": [
            "w ɛ ɹ WORD_BOUNDARY ɪ z WORD_BOUNDARY ð ɪ s WORD_BOUNDARY",
            "l ɪ ɾ əl WORD_BOUNDARY aɪ z WORD_BOUNDARY",
        ],
        "references": [
            "w ɛ ɹ WORD_BOUNDARY ɪ z WORD_BOUNDARY ð ɪ s WORD_BOUNDARY",
            "l ɪ ɾ əl WORD_BOUNDARY aɪ z WORD_BOUNDARY",
        ],
        "result": {
            "type_fscore": 1.0,
            "type_precision": 1.0,
            "type_recall": 1.0,
            "token_fscore": 1.0,
            "token_precision": 1.0,
            "token_recall": 1.0,
            "boundary_all_fscore": 1.0,
            "boundary_all_precision": 1.0,
            "boundary_all_recall": 1.0,
            "boundary_noedge_fscore": 1.0,
            "boundary_noedge_precision": 1.0,
            "boundary_noedge_recall": 1.0,
        },
    },
    # Case 2: the prediction drops the boundary between "t h e" and "d o g"
    # (one under-segmentation); every other boundary matches the reference.
    {
        "predictions": [
            "t h e d o g WORD_BOUNDARY i s WORD_BOUNDARY i n WORD_BOUNDARY t h e WORD_BOUNDARY b o a t WORD_BOUNDARY",
        ],
        "references": [
            "t h e WORD_BOUNDARY d o g WORD_BOUNDARY i s WORD_BOUNDARY i n WORD_BOUNDARY t h e WORD_BOUNDARY b o a t WORD_BOUNDARY",
        ],
        "result": {
            "type_fscore": 0.8,
            "type_precision": 0.8,
            "type_recall": 0.8,
            "token_fscore": 0.73,
            "token_precision": 0.8,
            "token_recall": 0.67,
            # NOTE(review): the other three triples in this case are consistent
            # with F = harmonic mean of precision and recall, but this one is
            # not (precision 1.0 and recall 0.94 would give ~0.97, not 1.0).
            # Values are kept exactly as recorded -- TODO confirm against the
            # scorer's actual output.
            "boundary_all_fscore": 1.0,
            "boundary_all_precision": 1.0,
            "boundary_all_recall": 0.94,
            "boundary_noedge_fscore": 0.89,
            "boundary_noedge_precision": 1.0,
            "boundary_noedge_recall": 0.8,
        },
    },
]