diff --git "a/all_results.json" "b/all_results.json" --- "a/all_results.json" +++ "b/all_results.json" @@ -98864,9 +98864,480 @@ "AC3_6": 0.20719387750245932, "AC3_7": 0.18474358969695973 }, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_3": { + "overall_acc": 0.2638095238095238, + "language_acc": { + "English": 0.26, + "Vietnamese": 0.22666666666666666, + "Malay": 0.26666666666666666, + "Indonesian": 0.26, + "Spanish": 0.25333333333333335, + "Chinese": 0.2866666666666667, + "Filipino": 0.29333333333333333 + }, + "consistency_score_2": 0.5980952380952381, + "consistency_score_3": 0.44228571428571434, + "consistency_score_4": 0.34971428571428564, + "consistency_score_5": 0.2866666666666666, + "consistency_score_6": 0.24095238095238097, + "consistency_score_7": 0.20666666666666667, + "detailed_consistency_score": { + "2_combine": { + "English,Vietnamese": 0.5866666666666667, + "English,Malay": 0.5933333333333334, + "English,Indonesian": 0.64, + "English,Spanish": 0.5466666666666666, + "English,Chinese": 0.5133333333333333, + "English,Filipino": 0.64, + "Vietnamese,Malay": 0.58, + "Vietnamese,Indonesian": 0.6, + "Vietnamese,Spanish": 0.5666666666666667, + "Vietnamese,Chinese": 0.5733333333333334, + "Vietnamese,Filipino": 0.6066666666666667, + "Malay,Indonesian": 0.7333333333333333, + "Malay,Spanish": 0.56, + "Malay,Chinese": 0.5533333333333333, + "Malay,Filipino": 0.7266666666666667, + "Indonesian,Spanish": 0.5666666666666667, + "Indonesian,Chinese": 0.5733333333333334, + "Indonesian,Filipino": 0.7266666666666667, + "Spanish,Chinese": 0.49333333333333335, + "Spanish,Filipino": 0.5866666666666667, + "Chinese,Filipino": 0.5933333333333334 + }, + "3_combine": { + "English,Vietnamese,Malay": 0.43333333333333335, + "English,Vietnamese,Indonesian": 0.4533333333333333, + "English,Vietnamese,Spanish": 0.41333333333333333, + "English,Vietnamese,Chinese": 0.3933333333333333, + "English,Vietnamese,Filipino": 0.4666666666666667, + "English,Malay,Indonesian": 0.5066666666666667, + "English,Malay,Spanish": 0.4066666666666667, + "English,Malay,Chinese": 0.37333333333333335, + "English,Malay,Filipino": 0.5133333333333333, + "English,Indonesian,Spanish": 0.43333333333333335, + "English,Indonesian,Chinese": 0.3933333333333333, + "English,Indonesian,Filipino": 0.52, + "English,Spanish,Chinese": 0.3466666666666667, + "English,Spanish,Filipino": 0.44, + "English,Chinese,Filipino": 0.3933333333333333, + "Vietnamese,Malay,Indonesian": 0.49333333333333335, + "Vietnamese,Malay,Spanish": 0.41333333333333333, + "Vietnamese,Malay,Chinese": 0.4066666666666667, + "Vietnamese,Malay,Filipino": 0.49333333333333335, + "Vietnamese,Indonesian,Spanish": 0.41333333333333333, + "Vietnamese,Indonesian,Chinese": 0.42, + "Vietnamese,Indonesian,Filipino": 0.48, + "Vietnamese,Spanish,Chinese": 0.38666666666666666, + "Vietnamese,Spanish,Filipino": 0.44666666666666666, + "Vietnamese,Chinese,Filipino": 0.44, + "Malay,Indonesian,Spanish": 0.48, + "Malay,Indonesian,Chinese": 0.46, + "Malay,Indonesian,Filipino": 0.6133333333333333, + "Malay,Spanish,Chinese": 0.36666666666666664, + "Malay,Spanish,Filipino": 0.48, + "Malay,Chinese,Filipino": 0.48, + "Indonesian,Spanish,Chinese": 0.37333333333333335, + "Indonesian,Spanish,Filipino": 0.47333333333333333, + "Indonesian,Chinese,Filipino": 0.4666666666666667, + "Spanish,Chinese,Filipino": 0.4066666666666667 + }, + "4_combine": { + "English,Vietnamese,Malay,Indonesian": 0.38, + "English,Vietnamese,Malay,Spanish": 0.32666666666666666, + "English,Vietnamese,Malay,Chinese": 0.3, + "English,Vietnamese,Malay,Filipino": 0.38666666666666666, + "English,Vietnamese,Indonesian,Spanish": 0.3333333333333333, + "English,Vietnamese,Indonesian,Chinese": 0.32, + "English,Vietnamese,Indonesian,Filipino": 0.38666666666666666, + "English,Vietnamese,Spanish,Chinese": 0.29333333333333333, + "English,Vietnamese,Spanish,Filipino": 0.3466666666666667, + "English,Vietnamese,Chinese,Filipino": 0.32666666666666666, + "English,Malay,Indonesian,Spanish": 0.37333333333333335, + "English,Malay,Indonesian,Chinese": 0.3333333333333333, + "English,Malay,Indonesian,Filipino": 0.44666666666666666, + "English,Malay,Spanish,Chinese": 0.2733333333333333, + "English,Malay,Spanish,Filipino": 0.36666666666666664, + "English,Malay,Chinese,Filipino": 0.3333333333333333, + "English,Indonesian,Spanish,Chinese": 0.2866666666666667, + "English,Indonesian,Spanish,Filipino": 0.36666666666666664, + "English,Indonesian,Chinese,Filipino": 0.34, + "English,Spanish,Chinese,Filipino": 0.2866666666666667, + "Vietnamese,Malay,Indonesian,Spanish": 0.36666666666666664, + "Vietnamese,Malay,Indonesian,Chinese": 0.35333333333333333, + "Vietnamese,Malay,Indonesian,Filipino": 0.4266666666666667, + "Vietnamese,Malay,Spanish,Chinese": 0.31333333333333335, + "Vietnamese,Malay,Spanish,Filipino": 0.38, + "Vietnamese,Malay,Chinese,Filipino": 0.36666666666666664, + "Vietnamese,Indonesian,Spanish,Chinese": 0.31333333333333335, + "Vietnamese,Indonesian,Spanish,Filipino": 0.36, + "Vietnamese,Indonesian,Chinese,Filipino": 0.36, + "Vietnamese,Spanish,Chinese,Filipino": 0.35333333333333333, + "Malay,Indonesian,Spanish,Chinese": 0.32666666666666666, + "Malay,Indonesian,Spanish,Filipino": 0.4266666666666667, + "Malay,Indonesian,Chinese,Filipino": 0.4066666666666667, + "Malay,Spanish,Chinese,Filipino": 0.3466666666666667, + "Indonesian,Spanish,Chinese,Filipino": 0.3333333333333333 + }, + "5_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish": 0.29333333333333333, + "English,Vietnamese,Malay,Indonesian,Chinese": 0.2733333333333333, + "English,Vietnamese,Malay,Indonesian,Filipino": 0.34, + "English,Vietnamese,Malay,Spanish,Chinese": 0.24, + "English,Vietnamese,Malay,Spanish,Filipino": 0.3, + "English,Vietnamese,Malay,Chinese,Filipino": 0.2733333333333333, + "English,Vietnamese,Indonesian,Spanish,Chinese": 0.24666666666666667, + "English,Vietnamese,Indonesian,Spanish,Filipino": 0.2866666666666667, + "English,Vietnamese,Indonesian,Chinese,Filipino": 0.28, + "English,Vietnamese,Spanish,Chinese,Filipino": 0.26, + "English,Malay,Indonesian,Spanish,Chinese": 0.25333333333333335, + "English,Malay,Indonesian,Spanish,Filipino": 0.3333333333333333, + "English,Malay,Indonesian,Chinese,Filipino": 0.3, + "English,Malay,Spanish,Chinese,Filipino": 0.25333333333333335, + "English,Indonesian,Spanish,Chinese,Filipino": 0.25333333333333335, + "Vietnamese,Malay,Indonesian,Spanish,Chinese": 0.28, + "Vietnamese,Malay,Indonesian,Spanish,Filipino": 0.3333333333333333, + "Vietnamese,Malay,Indonesian,Chinese,Filipino": 0.32, + "Vietnamese,Malay,Spanish,Chinese,Filipino": 0.3, + "Vietnamese,Indonesian,Spanish,Chinese,Filipino": 0.29333333333333333, + "Malay,Indonesian,Spanish,Chinese,Filipino": 0.30666666666666664 + }, + "6_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish,Chinese": 0.22, + "English,Vietnamese,Malay,Indonesian,Spanish,Filipino": 0.26666666666666666, + "English,Vietnamese,Malay,Indonesian,Chinese,Filipino": 0.24666666666666667, + "English,Vietnamese,Malay,Spanish,Chinese,Filipino": 0.22666666666666666, + "English,Vietnamese,Indonesian,Spanish,Chinese,Filipino": 0.22666666666666666, + "English,Malay,Indonesian,Spanish,Chinese,Filipino": 0.23333333333333334, + "Vietnamese,Malay,Indonesian,Spanish,Chinese,Filipino": 0.26666666666666666 + }, + "7_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish,Chinese,Filipino": 0.20666666666666667 + } + }, + "AC3_2": 0.36612680869206476, + "AC3_3": 0.33049134836739164, + "AC3_4": 0.3007477712731073, + "AC3_5": 0.27476355242990164, + "AC3_6": 0.2518634321154215, + "AC3_7": 0.23176788119230304 + }, + "prompt_4": { + "overall_acc": 0.2380952380952381, + "language_acc": { + "English": 0.23333333333333334, + "Vietnamese": 0.23333333333333334, + "Malay": 0.24, + "Indonesian": 0.21333333333333335, + "Spanish": 0.24666666666666667, + "Chinese": 0.25333333333333335, + "Filipino": 0.24666666666666667 + }, + "consistency_score_2": 0.5114285714285715, + "consistency_score_3": 0.3257142857142858, + "consistency_score_4": 0.22685714285714284, + "consistency_score_5": 0.16507936507936508, + "consistency_score_6": 0.12095238095238095, + "consistency_score_7": 0.08666666666666667, + "detailed_consistency_score": { + "2_combine": { + "English,Vietnamese": 0.5333333333333333, + "English,Malay": 0.52, + "English,Indonesian": 0.5666666666666667, + "English,Spanish": 0.44, + "English,Chinese": 0.48, + "English,Filipino": 0.58, + "Vietnamese,Malay": 0.44, + "Vietnamese,Indonesian": 0.5133333333333333, + "Vietnamese,Spanish": 0.4066666666666667, + "Vietnamese,Chinese": 0.5733333333333334, + "Vietnamese,Filipino": 0.44, + "Malay,Indonesian": 0.7666666666666667, + "Malay,Spanish": 0.42, + "Malay,Chinese": 0.41333333333333333, + "Malay,Filipino": 0.7066666666666667, + "Indonesian,Spanish": 0.42, + "Indonesian,Chinese": 0.4866666666666667, + "Indonesian,Filipino": 0.6933333333333334, + "Spanish,Chinese": 0.44, + "Spanish,Filipino": 0.44666666666666666, + "Chinese,Filipino": 0.4533333333333333 + }, + "3_combine": { + "English,Vietnamese,Malay": 0.3333333333333333, + "English,Vietnamese,Indonesian": 0.37333333333333335, + "English,Vietnamese,Spanish": 0.24666666666666667, + "English,Vietnamese,Chinese": 0.36, + "English,Vietnamese,Filipino": 0.34, + "English,Malay,Indonesian": 0.4533333333333333, + "English,Malay,Spanish": 0.26666666666666666, + "English,Malay,Chinese": 0.2733333333333333, + "English,Malay,Filipino": 0.4533333333333333, + "English,Indonesian,Spanish": 0.2733333333333333, + "English,Indonesian,Chinese": 0.32666666666666666, + "English,Indonesian,Filipino": 0.44, + "English,Spanish,Chinese": 0.25333333333333335, + "English,Spanish,Filipino": 0.3, + "English,Chinese,Filipino": 0.32, + "Vietnamese,Malay,Indonesian": 0.3933333333333333, + "Vietnamese,Malay,Spanish": 0.22666666666666666, + "Vietnamese,Malay,Chinese": 0.26666666666666666, + "Vietnamese,Malay,Filipino": 0.34, + "Vietnamese,Indonesian,Spanish": 0.24666666666666667, + "Vietnamese,Indonesian,Chinese": 0.3333333333333333, + "Vietnamese,Indonesian,Filipino": 0.35333333333333333, + "Vietnamese,Spanish,Chinese": 0.2866666666666667, + "Vietnamese,Spanish,Filipino": 0.23333333333333334, + "Vietnamese,Chinese,Filipino": 0.29333333333333333, + "Malay,Indonesian,Spanish": 0.35333333333333333, + "Malay,Indonesian,Chinese": 0.37333333333333335, + "Malay,Indonesian,Filipino": 0.6066666666666667, + "Malay,Spanish,Chinese": 0.23333333333333334, + "Malay,Spanish,Filipino": 0.3333333333333333, + "Malay,Chinese,Filipino": 0.3333333333333333, + "Indonesian,Spanish,Chinese": 0.25333333333333335, + "Indonesian,Spanish,Filipino": 0.32, + "Indonesian,Chinese,Filipino": 0.35333333333333333, + "Spanish,Chinese,Filipino": 0.25333333333333335 + }, + "4_combine": { + "English,Vietnamese,Malay,Indonesian": 0.3, + "English,Vietnamese,Malay,Spanish": 0.16666666666666666, + "English,Vietnamese,Malay,Chinese": 0.20666666666666667, + "English,Vietnamese,Malay,Filipino": 0.28, + "English,Vietnamese,Indonesian,Spanish": 0.18666666666666668, + "English,Vietnamese,Indonesian,Chinese": 0.26, + "English,Vietnamese,Indonesian,Filipino": 0.2733333333333333, + "English,Vietnamese,Spanish,Chinese": 0.16666666666666666, + "English,Vietnamese,Spanish,Filipino": 0.17333333333333334, + "English,Vietnamese,Chinese,Filipino": 0.22666666666666666, + "English,Malay,Indonesian,Spanish": 0.23333333333333334, + "English,Malay,Indonesian,Chinese": 0.25333333333333335, + "English,Malay,Indonesian,Filipino": 0.4, + "English,Malay,Spanish,Chinese": 0.14666666666666667, + "English,Malay,Spanish,Filipino": 0.23333333333333334, + "English,Malay,Chinese,Filipino": 0.24666666666666667, + "English,Indonesian,Spanish,Chinese": 0.17333333333333334, + "English,Indonesian,Spanish,Filipino": 0.22666666666666666, + "English,Indonesian,Chinese,Filipino": 0.25333333333333335, + "English,Spanish,Chinese,Filipino": 0.18666666666666668, + "Vietnamese,Malay,Indonesian,Spanish": 0.20666666666666667, + "Vietnamese,Malay,Indonesian,Chinese": 0.25333333333333335, + "Vietnamese,Malay,Indonesian,Filipino": 0.31333333333333335, + "Vietnamese,Malay,Spanish,Chinese": 0.16, + "Vietnamese,Malay,Spanish,Filipino": 0.18666666666666668, + "Vietnamese,Malay,Chinese,Filipino": 0.22666666666666666, + "Vietnamese,Indonesian,Spanish,Chinese": 0.18, + "Vietnamese,Indonesian,Spanish,Filipino": 0.18666666666666668, + "Vietnamese,Indonesian,Chinese,Filipino": 0.24666666666666667, + "Vietnamese,Spanish,Chinese,Filipino": 0.18, + "Malay,Indonesian,Spanish,Chinese": 0.20666666666666667, + "Malay,Indonesian,Spanish,Filipino": 0.29333333333333333, + "Malay,Indonesian,Chinese,Filipino": 0.31333333333333335, + "Malay,Spanish,Chinese,Filipino": 0.19333333333333333, + "Indonesian,Spanish,Chinese,Filipino": 0.2 + }, + "5_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish": 0.15333333333333332, + "English,Vietnamese,Malay,Indonesian,Chinese": 0.2, + "English,Vietnamese,Malay,Indonesian,Filipino": 0.25333333333333335, + "English,Vietnamese,Malay,Spanish,Chinese": 0.1, + "English,Vietnamese,Malay,Spanish,Filipino": 0.14, + "English,Vietnamese,Malay,Chinese,Filipino": 0.18, + "English,Vietnamese,Indonesian,Spanish,Chinese": 0.12666666666666668, + "English,Vietnamese,Indonesian,Spanish,Filipino": 0.14, + "English,Vietnamese,Indonesian,Chinese,Filipino": 0.19333333333333333, + "English,Vietnamese,Spanish,Chinese,Filipino": 0.12, + "English,Malay,Indonesian,Spanish,Chinese": 0.13333333333333333, + "English,Malay,Indonesian,Spanish,Filipino": 0.20666666666666667, + "English,Malay,Indonesian,Chinese,Filipino": 0.22666666666666666, + "English,Malay,Spanish,Chinese,Filipino": 0.14, + "English,Indonesian,Spanish,Chinese,Filipino": 0.14666666666666667, + "Vietnamese,Malay,Indonesian,Spanish,Chinese": 0.14666666666666667, + "Vietnamese,Malay,Indonesian,Spanish,Filipino": 0.17333333333333334, + "Vietnamese,Malay,Indonesian,Chinese,Filipino": 0.22, + "Vietnamese,Malay,Spanish,Chinese,Filipino": 0.14, + "Vietnamese,Indonesian,Spanish,Chinese,Filipino": 0.14666666666666667, + "Malay,Indonesian,Spanish,Chinese,Filipino": 0.18 + }, + "6_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish,Chinese": 0.09333333333333334, + "English,Vietnamese,Malay,Indonesian,Spanish,Filipino": 0.12666666666666668, + "English,Vietnamese,Malay,Indonesian,Chinese,Filipino": 0.17333333333333334, + "English,Vietnamese,Malay,Spanish,Chinese,Filipino": 0.09333333333333334, + "English,Vietnamese,Indonesian,Spanish,Chinese,Filipino": 0.1, + "English,Malay,Indonesian,Spanish,Chinese,Filipino": 0.12666666666666668, + "Vietnamese,Malay,Indonesian,Spanish,Chinese,Filipino": 0.13333333333333333 + }, + "7_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish,Chinese,Filipino": 0.08666666666666667 + } + }, + "AC3_2": 0.3249228534690836, + "AC3_3": 0.2750965250477327, + "AC3_4": 0.232340375674264, + "AC3_5": 0.1949756279981341, + "AC3_6": 0.1604142983006551, + "AC3_7": 0.12707722381228806 + }, + "prompt_5": { + "overall_acc": 0.26285714285714284, + "language_acc": { + "English": 0.22666666666666666, + "Vietnamese": 0.26, + "Malay": 0.2733333333333333, + "Indonesian": 0.22666666666666666, + "Spanish": 0.2866666666666667, + "Chinese": 0.28, + "Filipino": 0.2866666666666667 + }, + "consistency_score_2": 0.4933333333333334, + "consistency_score_3": 0.30380952380952386, + "consistency_score_4": 0.2015238095238095, + "consistency_score_5": 0.1365079365079365, + "consistency_score_6": 0.09142857142857144, + "consistency_score_7": 0.06, + "detailed_consistency_score": { + "2_combine": { + "English,Vietnamese": 0.4266666666666667, + "English,Malay": 0.5066666666666667, + "English,Indonesian": 0.5266666666666666, + "English,Spanish": 0.5266666666666666, + "English,Chinese": 0.47333333333333333, + "English,Filipino": 0.54, + "Vietnamese,Malay": 0.36666666666666664, + "Vietnamese,Indonesian": 0.41333333333333333, + "Vietnamese,Spanish": 0.3333333333333333, + "Vietnamese,Chinese": 0.48, + "Vietnamese,Filipino": 0.36666666666666664, + "Malay,Indonesian": 0.6133333333333333, + "Malay,Spanish": 0.54, + "Malay,Chinese": 0.4866666666666667, + "Malay,Filipino": 0.62, + "Indonesian,Spanish": 0.5533333333333333, + "Indonesian,Chinese": 0.5266666666666666, + "Indonesian,Filipino": 0.58, + "Spanish,Chinese": 0.46, + "Spanish,Filipino": 0.5533333333333333, + "Chinese,Filipino": 0.4666666666666667 + }, + "3_combine": { + "English,Vietnamese,Malay": 0.23333333333333334, + "English,Vietnamese,Indonesian": 0.26, + "English,Vietnamese,Spanish": 0.21333333333333335, + "English,Vietnamese,Chinese": 0.26, + "English,Vietnamese,Filipino": 0.25333333333333335, + "English,Malay,Indonesian": 0.37333333333333335, + "English,Malay,Spanish": 0.3333333333333333, + "English,Malay,Chinese": 0.3, + "English,Malay,Filipino": 0.4, + "English,Indonesian,Spanish": 0.35333333333333333, + "English,Indonesian,Chinese": 0.3333333333333333, + "English,Indonesian,Filipino": 0.38666666666666666, + "English,Spanish,Chinese": 0.2866666666666667, + "English,Spanish,Filipino": 0.36666666666666664, + "English,Chinese,Filipino": 0.31333333333333335, + "Vietnamese,Malay,Indonesian": 0.25333333333333335, + "Vietnamese,Malay,Spanish": 0.18666666666666668, + "Vietnamese,Malay,Chinese": 0.24, + "Vietnamese,Malay,Filipino": 0.23333333333333334, + "Vietnamese,Indonesian,Spanish": 0.22, + "Vietnamese,Indonesian,Chinese": 0.2733333333333333, + "Vietnamese,Indonesian,Filipino": 0.24666666666666667, + "Vietnamese,Spanish,Chinese": 0.22, + "Vietnamese,Spanish,Filipino": 0.21333333333333335, + "Vietnamese,Chinese,Filipino": 0.22, + "Malay,Indonesian,Spanish": 0.41333333333333333, + "Malay,Indonesian,Chinese": 0.38, + "Malay,Indonesian,Filipino": 0.44666666666666666, + "Malay,Spanish,Chinese": 0.29333333333333333, + "Malay,Spanish,Filipino": 0.4066666666666667, + "Malay,Chinese,Filipino": 0.3333333333333333, + "Indonesian,Spanish,Chinese": 0.32, + "Indonesian,Spanish,Filipino": 0.41333333333333333, + "Indonesian,Chinese,Filipino": 0.3466666666666667, + "Spanish,Chinese,Filipino": 0.30666666666666664 + }, + "4_combine": { + "English,Vietnamese,Malay,Indonesian": 0.18666666666666668, + "English,Vietnamese,Malay,Spanish": 0.13333333333333333, + "English,Vietnamese,Malay,Chinese": 0.16, + "English,Vietnamese,Malay,Filipino": 0.17333333333333334, + "English,Vietnamese,Indonesian,Spanish": 0.16666666666666666, + "English,Vietnamese,Indonesian,Chinese": 0.18666666666666668, + "English,Vietnamese,Indonesian,Filipino": 0.18, + "English,Vietnamese,Spanish,Chinese": 0.14666666666666667, + "English,Vietnamese,Spanish,Filipino": 0.16, + "English,Vietnamese,Chinese,Filipino": 0.16, + "English,Malay,Indonesian,Spanish": 0.2733333333333333, + "English,Malay,Indonesian,Chinese": 0.24666666666666667, + "English,Malay,Indonesian,Filipino": 0.32, + "English,Malay,Spanish,Chinese": 0.19333333333333333, + "English,Malay,Spanish,Filipino": 0.2866666666666667, + "English,Malay,Chinese,Filipino": 0.23333333333333334, + "English,Indonesian,Spanish,Chinese": 0.22, + "English,Indonesian,Spanish,Filipino": 0.29333333333333333, + "English,Indonesian,Chinese,Filipino": 0.26, + "English,Spanish,Chinese,Filipino": 0.21333333333333335, + "Vietnamese,Malay,Indonesian,Spanish": 0.15333333333333332, + "Vietnamese,Malay,Indonesian,Chinese": 0.19333333333333333, + "Vietnamese,Malay,Indonesian,Filipino": 0.18, + "Vietnamese,Malay,Spanish,Chinese": 0.13333333333333333, + "Vietnamese,Malay,Spanish,Filipino": 0.15333333333333332, + "Vietnamese,Malay,Chinese,Filipino": 0.14666666666666667, + "Vietnamese,Indonesian,Spanish,Chinese": 0.14, + "Vietnamese,Indonesian,Spanish,Filipino": 0.15333333333333332, + "Vietnamese,Indonesian,Chinese,Filipino": 0.15333333333333332, + "Vietnamese,Spanish,Chinese,Filipino": 0.14666666666666667, + "Malay,Indonesian,Spanish,Chinese": 0.23333333333333334, + "Malay,Indonesian,Spanish,Filipino": 0.34, + "Malay,Indonesian,Chinese,Filipino": 0.28, + "Malay,Spanish,Chinese,Filipino": 0.22, + "Indonesian,Spanish,Chinese,Filipino": 0.23333333333333334 + }, + "5_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish": 0.12, + "English,Vietnamese,Malay,Indonesian,Chinese": 0.14, + "English,Vietnamese,Malay,Indonesian,Filipino": 0.14666666666666667, + "English,Vietnamese,Malay,Spanish,Chinese": 0.09333333333333334, + "English,Vietnamese,Malay,Spanish,Filipino": 0.11333333333333333, + "English,Vietnamese,Malay,Chinese,Filipino": 0.11333333333333333, + "English,Vietnamese,Indonesian,Spanish,Chinese": 0.11333333333333333, + "English,Vietnamese,Indonesian,Spanish,Filipino": 0.12, + "English,Vietnamese,Indonesian,Chinese,Filipino": 0.12666666666666668, + "English,Vietnamese,Spanish,Chinese,Filipino": 0.10666666666666667, + "English,Malay,Indonesian,Spanish,Chinese": 0.16, + "English,Malay,Indonesian,Spanish,Filipino": 0.24666666666666667, + "English,Malay,Indonesian,Chinese,Filipino": 0.20666666666666667, + "English,Malay,Spanish,Chinese,Filipino": 0.16, + "English,Indonesian,Spanish,Chinese,Filipino": 0.17333333333333334, + "Vietnamese,Malay,Indonesian,Spanish,Chinese": 0.1, + "Vietnamese,Malay,Indonesian,Spanish,Filipino": 0.12666666666666668, + "Vietnamese,Malay,Indonesian,Chinese,Filipino": 0.12, + "Vietnamese,Malay,Spanish,Chinese,Filipino": 0.1, + "Vietnamese,Indonesian,Spanish,Chinese,Filipino": 0.09333333333333334, + "Malay,Indonesian,Spanish,Chinese,Filipino": 0.18666666666666668 + }, + "6_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish,Chinese": 0.08, + "English,Vietnamese,Malay,Indonesian,Spanish,Filipino": 0.1, + "English,Vietnamese,Malay,Indonesian,Chinese,Filipino": 0.1, + "English,Vietnamese,Malay,Spanish,Chinese,Filipino": 0.07333333333333333, + "English,Vietnamese,Indonesian,Spanish,Chinese,Filipino": 0.07333333333333333, + "English,Malay,Indonesian,Spanish,Chinese,Filipino": 0.14, + "Vietnamese,Malay,Indonesian,Spanish,Chinese,Filipino": 0.07333333333333333 + }, + "7_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish,Chinese,Filipino": 0.06 + } + }, + "AC3_2": 0.3429722921460805, + "AC3_3": 0.2818535413668278, + "AC3_4": 0.22814016167593953, + "AC3_5": 0.17969566200360784, + "AC3_6": 0.1356682027266835, + "AC3_7": 0.09769911501398701 + } }, "cross_logiqa": { "prompt_1": { @@ -99185,9 +99656,480 @@ "AC3_6": 0.23055914412213743, "AC3_7": 0.20402398743672984 }, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_3": { + "overall_acc": 0.24675324675324675, + "language_acc": { + "Vietnamese": 0.24431818181818182, + "Indonesian": 0.2215909090909091, + "Malay": 0.23863636363636365, + "English": 0.23863636363636365, + "Spanish": 0.2727272727272727, + "Filipino": 0.23863636363636365, + "Chinese": 0.2727272727272727 + }, + "consistency_score_2": 0.6260822510822509, + "consistency_score_3": 0.472564935064935, + "consistency_score_4": 0.37499999999999983, + "consistency_score_5": 0.30303030303030304, + "consistency_score_6": 0.24594155844155846, + "consistency_score_7": 0.19886363636363635, + "detailed_consistency_score": { + "2_combine": { + "Vietnamese,Indonesian": 0.8125, + "Vietnamese,Malay": 0.7670454545454546, + "Vietnamese,English": 0.6022727272727273, + "Vietnamese,Spanish": 0.4943181818181818, + "Vietnamese,Filipino": 0.7784090909090909, + "Vietnamese,Chinese": 0.6079545454545454, + "Indonesian,Malay": 0.9090909090909091, + "Indonesian,English": 0.5625, + "Indonesian,Spanish": 0.4318181818181818, + "Indonesian,Filipino": 0.9375, + "Indonesian,Chinese": 0.6477272727272727, + "Malay,English": 0.5397727272727273, + "Malay,Spanish": 0.42613636363636365, + "Malay,Filipino": 0.9261363636363636, + "Malay,Chinese": 0.6306818181818182, + "English,Spanish": 0.5340909090909091, + "English,Filipino": 0.5340909090909091, + "English,Chinese": 0.5568181818181818, + "Spanish,Filipino": 0.42613636363636365, + "Spanish,Chinese": 0.3977272727272727, + "Filipino,Chinese": 0.625 + }, + "3_combine": { + "Vietnamese,Indonesian,Malay": 0.75, + "Vietnamese,Indonesian,English": 0.5170454545454546, + "Vietnamese,Indonesian,Spanish": 0.39204545454545453, + "Vietnamese,Indonesian,Filipino": 0.7670454545454546, + "Vietnamese,Indonesian,Chinese": 0.5738636363636364, + "Vietnamese,Malay,English": 0.48295454545454547, + "Vietnamese,Malay,Spanish": 0.375, + "Vietnamese,Malay,Filipino": 0.7386363636363636, + "Vietnamese,Malay,Chinese": 0.5454545454545454, + "Vietnamese,English,Spanish": 0.3522727272727273, + "Vietnamese,English,Filipino": 0.48863636363636365, + "Vietnamese,English,Chinese": 0.4431818181818182, + "Vietnamese,Spanish,Filipino": 0.375, + "Vietnamese,Spanish,Chinese": 0.29545454545454547, + "Vietnamese,Filipino,Chinese": 0.5454545454545454, + "Indonesian,Malay,English": 0.5227272727272727, + "Indonesian,Malay,Spanish": 0.4034090909090909, + "Indonesian,Malay,Filipino": 0.8863636363636364, + "Indonesian,Malay,Chinese": 0.6079545454545454, + "Indonesian,English,Spanish": 0.3068181818181818, + "Indonesian,English,Filipino": 0.5284090909090909, + "Indonesian,English,Chinese": 0.44886363636363635, + "Indonesian,Spanish,Filipino": 0.4090909090909091, + "Indonesian,Spanish,Chinese": 0.3068181818181818, + "Indonesian,Filipino,Chinese": 0.6136363636363636, + "Malay,English,Spanish": 0.2897727272727273, + "Malay,English,Filipino": 0.5113636363636364, + "Malay,English,Chinese": 0.4375, + "Malay,Spanish,Filipino": 0.4090909090909091, + "Malay,Spanish,Chinese": 0.3068181818181818, + "Malay,Filipino,Chinese": 0.5965909090909091, + "English,Spanish,Filipino": 0.29545454545454547, + "English,Spanish,Chinese": 0.2840909090909091, + "English,Filipino,Chinese": 0.4318181818181818, + "Spanish,Filipino,Chinese": 0.30113636363636365 + }, + "4_combine": { + "Vietnamese,Indonesian,Malay,English": 0.4772727272727273, + "Vietnamese,Indonesian,Malay,Spanish": 0.36363636363636365, + "Vietnamese,Indonesian,Malay,Filipino": 0.7272727272727273, + "Vietnamese,Indonesian,Malay,Chinese": 0.5397727272727273, + "Vietnamese,Indonesian,English,Spanish": 0.2897727272727273, + "Vietnamese,Indonesian,English,Filipino": 0.48295454545454547, + "Vietnamese,Indonesian,English,Chinese": 0.4147727272727273, + "Vietnamese,Indonesian,Spanish,Filipino": 0.3693181818181818, + "Vietnamese,Indonesian,Spanish,Chinese": 0.2784090909090909, + "Vietnamese,Indonesian,Filipino,Chinese": 0.5397727272727273, + "Vietnamese,Malay,English,Spanish": 0.2727272727272727, + "Vietnamese,Malay,English,Filipino": 0.4659090909090909, + "Vietnamese,Malay,English,Chinese": 0.4034090909090909, + "Vietnamese,Malay,Spanish,Filipino": 0.35795454545454547, + "Vietnamese,Malay,Spanish,Chinese": 0.2727272727272727, + "Vietnamese,Malay,Filipino,Chinese": 0.5227272727272727, + "Vietnamese,English,Spanish,Filipino": 0.2784090909090909, + "Vietnamese,English,Spanish,Chinese": 0.22727272727272727, + "Vietnamese,English,Filipino,Chinese": 0.3977272727272727, + "Vietnamese,Spanish,Filipino,Chinese": 0.26704545454545453, + "Indonesian,Malay,English,Spanish": 0.2840909090909091, + "Indonesian,Malay,English,Filipino": 0.5056818181818182, + "Indonesian,Malay,English,Chinese": 0.4318181818181818, + "Indonesian,Malay,Spanish,Filipino": 0.39204545454545453, + "Indonesian,Malay,Spanish,Chinese": 0.29545454545454547, + "Indonesian,Malay,Filipino,Chinese": 0.5852272727272727, + "Indonesian,English,Spanish,Filipino": 0.2897727272727273, + "Indonesian,English,Spanish,Chinese": 0.22727272727272727, + "Indonesian,English,Filipino,Chinese": 0.42613636363636365, + "Indonesian,Spanish,Filipino,Chinese": 0.2897727272727273, + "Malay,English,Spanish,Filipino": 0.2840909090909091, + "Malay,English,Spanish,Chinese": 0.22727272727272727, + "Malay,English,Filipino,Chinese": 0.42045454545454547, + "Malay,Spanish,Filipino,Chinese": 0.29545454545454547, + "English,Spanish,Filipino,Chinese": 0.2215909090909091 + }, + "5_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish": 0.26704545454545453, + "Vietnamese,Indonesian,Malay,English,Filipino": 0.4602272727272727, + "Vietnamese,Indonesian,Malay,English,Chinese": 0.3977272727272727, + "Vietnamese,Indonesian,Malay,Spanish,Filipino": 0.3522727272727273, + "Vietnamese,Indonesian,Malay,Spanish,Chinese": 0.26704545454545453, + "Vietnamese,Indonesian,Malay,Filipino,Chinese": 0.5170454545454546, + "Vietnamese,Indonesian,English,Spanish,Filipino": 0.2727272727272727, + "Vietnamese,Indonesian,English,Spanish,Chinese": 0.21022727272727273, + "Vietnamese,Indonesian,English,Filipino,Chinese": 0.39204545454545453, + "Vietnamese,Indonesian,Spanish,Filipino,Chinese": 0.26136363636363635, + "Vietnamese,Malay,English,Spanish,Filipino": 0.26704545454545453, + "Vietnamese,Malay,English,Spanish,Chinese": 0.21022727272727273, + "Vietnamese,Malay,English,Filipino,Chinese": 0.38636363636363635, + "Vietnamese,Malay,Spanish,Filipino,Chinese": 0.26136363636363635, + "Vietnamese,English,Spanish,Filipino,Chinese": 0.20454545454545456, + "Indonesian,Malay,English,Spanish,Filipino": 0.2784090909090909, + "Indonesian,Malay,English,Spanish,Chinese": 0.2215909090909091, + "Indonesian,Malay,English,Filipino,Chinese": 0.4147727272727273, + "Indonesian,Malay,Spanish,Filipino,Chinese": 0.2840909090909091, + "Indonesian,English,Spanish,Filipino,Chinese": 0.2159090909090909, + "Malay,English,Spanish,Filipino,Chinese": 0.2215909090909091 + }, + "6_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish,Filipino": 0.26136363636363635, + "Vietnamese,Indonesian,Malay,English,Spanish,Chinese": 0.20454545454545456, + "Vietnamese,Indonesian,Malay,English,Filipino,Chinese": 0.3806818181818182, + "Vietnamese,Indonesian,Malay,Spanish,Filipino,Chinese": 0.2556818181818182, + "Vietnamese,Indonesian,English,Spanish,Filipino,Chinese": 0.19886363636363635, + "Vietnamese,Malay,English,Spanish,Filipino,Chinese": 0.20454545454545456, + "Indonesian,Malay,English,Spanish,Filipino,Chinese": 0.2159090909090909 + }, + "7_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish,Filipino,Chinese": 0.19886363636363635 + } + }, + "AC3_2": 0.35399070856887505, + "AC3_3": 0.3242151664178935, + "AC3_4": 0.2976501305004301, + "AC3_5": 0.2720114530643392, + "AC3_6": 0.2463467339408857, + "AC3_7": 0.22023513821850263 + }, + "prompt_4": { + "overall_acc": 0.24756493506493507, + "language_acc": { + "Vietnamese": 0.24431818181818182, + "Indonesian": 0.23295454545454544, + "Malay": 0.25, + "English": 0.22727272727272727, + "Spanish": 0.26136363636363635, + "Filipino": 0.23863636363636365, + "Chinese": 0.2784090909090909 + }, + "consistency_score_2": 0.5116341991341992, + "consistency_score_3": 0.29999999999999993, + "consistency_score_4": 0.19334415584415585, + "consistency_score_5": 0.13501082251082253, + "consistency_score_6": 0.09902597402597402, + "consistency_score_7": 0.07386363636363637, + "detailed_consistency_score": { + "2_combine": { + "Vietnamese,Indonesian": 0.4375, + "Vietnamese,Malay": 0.42045454545454547, + "Vietnamese,English": 0.6193181818181818, + "Vietnamese,Spanish": 0.5852272727272727, + "Vietnamese,Filipino": 0.4090909090909091, + "Vietnamese,Chinese": 0.48863636363636365, + "Indonesian,Malay": 0.9431818181818182, + "Indonesian,English": 0.3522727272727273, + "Indonesian,Spanish": 0.29545454545454547, + "Indonesian,Filipino": 0.9545454545454546, + "Indonesian,Chinese": 0.5284090909090909, + "Malay,English": 0.3352272727272727, + "Malay,Spanish": 0.2897727272727273, + "Malay,Filipino": 0.9431818181818182, + "Malay,Chinese": 0.5113636363636364, + "English,Spanish": 0.6363636363636364, + "English,Filipino": 0.3181818181818182, + "English,Chinese": 0.4943181818181818, + "Spanish,Filipino": 0.2727272727272727, + "Spanish,Chinese": 0.4090909090909091, + "Filipino,Chinese": 0.5 + }, + "3_combine": { + "Vietnamese,Indonesian,Malay": 0.4090909090909091, + "Vietnamese,Indonesian,English": 0.23863636363636365, + "Vietnamese,Indonesian,Spanish": 0.1875, + "Vietnamese,Indonesian,Filipino": 0.4034090909090909, + "Vietnamese,Indonesian,Chinese": 0.29545454545454547, + "Vietnamese,Malay,English": 0.2215909090909091, + "Vietnamese,Malay,Spanish": 0.18181818181818182, + "Vietnamese,Malay,Filipino": 0.39204545454545453, + "Vietnamese,Malay,Chinese": 0.2840909090909091, + "Vietnamese,English,Spanish": 0.44886363636363635, + "Vietnamese,English,Filipino": 0.21022727272727273, + "Vietnamese,English,Chinese": 0.3352272727272727, + "Vietnamese,Spanish,Filipino": 0.16477272727272727, + "Vietnamese,Spanish,Chinese": 0.26704545454545453, + "Vietnamese,Filipino,Chinese": 0.26704545454545453, + "Indonesian,Malay,English": 0.32386363636363635, + "Indonesian,Malay,Spanish": 0.26704545454545453, + "Indonesian,Malay,Filipino": 0.9204545454545454, + "Indonesian,Malay,Chinese": 0.5, + "Indonesian,English,Spanish": 0.19318181818181818, + "Indonesian,English,Filipino": 0.3125, + "Indonesian,English,Chinese": 0.2556818181818182, + "Indonesian,Spanish,Filipino": 0.26136363636363635, + "Indonesian,Spanish,Chinese": 0.17613636363636365, + "Indonesian,Filipino,Chinese": 0.4943181818181818, + "Malay,English,Spanish": 0.18181818181818182, + "Malay,English,Filipino": 0.3068181818181818, + "Malay,English,Chinese": 0.24431818181818182, + "Malay,Spanish,Filipino": 0.2556818181818182, + "Malay,Spanish,Chinese": 0.16477272727272727, + "Malay,Filipino,Chinese": 0.48863636363636365, + "English,Spanish,Filipino": 0.16477272727272727, + "English,Spanish,Chinese": 0.3125, + "English,Filipino,Chinese": 0.2215909090909091, + "Spanish,Filipino,Chinese": 0.14772727272727273 + }, + "4_combine": { + "Vietnamese,Indonesian,Malay,English": 0.2159090909090909, + "Vietnamese,Indonesian,Malay,Spanish": 0.17045454545454544, + "Vietnamese,Indonesian,Malay,Filipino": 0.39204545454545453, + "Vietnamese,Indonesian,Malay,Chinese": 0.2784090909090909, + "Vietnamese,Indonesian,English,Spanish": 0.13636363636363635, + "Vietnamese,Indonesian,English,Filipino": 0.20454545454545456, + "Vietnamese,Indonesian,English,Chinese": 0.17613636363636365, + "Vietnamese,Indonesian,Spanish,Filipino": 0.1590909090909091, + "Vietnamese,Indonesian,Spanish,Chinese": 0.125, + "Vietnamese,Indonesian,Filipino,Chinese": 0.26704545454545453, + "Vietnamese,Malay,English,Spanish": 0.125, + "Vietnamese,Malay,English,Filipino": 0.19886363636363635, + "Vietnamese,Malay,English,Chinese": 0.17045454545454544, + "Vietnamese,Malay,Spanish,Filipino": 0.1534090909090909, + "Vietnamese,Malay,Spanish,Chinese": 0.11931818181818182, + "Vietnamese,Malay,Filipino,Chinese": 0.26136363636363635, + "Vietnamese,English,Spanish,Filipino": 0.11363636363636363, + "Vietnamese,English,Spanish,Chinese": 0.23295454545454544, + "Vietnamese,English,Filipino,Chinese": 0.14772727272727273, + "Vietnamese,Spanish,Filipino,Chinese": 0.09659090909090909, + "Indonesian,Malay,English,Spanish": 0.17045454545454544, + "Indonesian,Malay,English,Filipino": 0.3068181818181818, + "Indonesian,Malay,English,Chinese": 0.23863636363636365, + "Indonesian,Malay,Spanish,Filipino": 0.25, + "Indonesian,Malay,Spanish,Chinese": 0.1534090909090909, + "Indonesian,Malay,Filipino,Chinese": 0.48295454545454547, + "Indonesian,English,Spanish,Filipino": 0.1590909090909091, + "Indonesian,English,Spanish,Chinese": 0.14204545454545456, + "Indonesian,English,Filipino,Chinese": 0.2215909090909091, + "Indonesian,Spanish,Filipino,Chinese": 0.14204545454545456, + "Malay,English,Spanish,Filipino": 0.1534090909090909, + "Malay,English,Spanish,Chinese": 0.13068181818181818, + "Malay,English,Filipino,Chinese": 0.2215909090909091, + "Malay,Spanish,Filipino,Chinese": 0.14204545454545456, + "English,Spanish,Filipino,Chinese": 0.10795454545454546 + }, + "5_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish": 0.11931818181818182, + "Vietnamese,Indonesian,Malay,English,Filipino": 0.19886363636363635, + "Vietnamese,Indonesian,Malay,English,Chinese": 0.16477272727272727, + "Vietnamese,Indonesian,Malay,Spanish,Filipino": 0.1534090909090909, + "Vietnamese,Indonesian,Malay,Spanish,Chinese": 0.11363636363636363, + "Vietnamese,Indonesian,Malay,Filipino,Chinese": 0.26136363636363635, + "Vietnamese,Indonesian,English,Spanish,Filipino": 0.10795454545454546, + "Vietnamese,Indonesian,English,Spanish,Chinese": 0.10227272727272728, + "Vietnamese,Indonesian,English,Filipino,Chinese": 0.14772727272727273, + "Vietnamese,Indonesian,Spanish,Filipino,Chinese": 0.09659090909090909, + "Vietnamese,Malay,English,Spanish,Filipino": 0.10227272727272728, + "Vietnamese,Malay,English,Spanish,Chinese": 0.09659090909090909, + "Vietnamese,Malay,English,Filipino,Chinese": 0.14772727272727273, + "Vietnamese,Malay,Spanish,Filipino,Chinese": 0.09659090909090909, + "Vietnamese,English,Spanish,Filipino,Chinese": 0.07386363636363637, + "Indonesian,Malay,English,Spanish,Filipino": 0.1534090909090909, + "Indonesian,Malay,English,Spanish,Chinese": 0.125, + "Indonesian,Malay,English,Filipino,Chinese": 0.2215909090909091, + "Indonesian,Malay,Spanish,Filipino,Chinese": 0.13636363636363635, + "Indonesian,English,Spanish,Filipino,Chinese": 0.10795454545454546, + "Malay,English,Spanish,Filipino,Chinese": 0.10795454545454546 + }, + "6_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish,Filipino": 0.10227272727272728, + "Vietnamese,Indonesian,Malay,English,Spanish,Chinese": 0.09090909090909091, + "Vietnamese,Indonesian,Malay,English,Filipino,Chinese": 0.14772727272727273, + "Vietnamese,Indonesian,Malay,Spanish,Filipino,Chinese": 0.09659090909090909, + "Vietnamese,Indonesian,English,Spanish,Filipino,Chinese": 0.07386363636363637, + "Vietnamese,Malay,English,Spanish,Filipino,Chinese": 0.07386363636363637, + "Indonesian,Malay,English,Spanish,Filipino,Chinese": 0.10795454545454546 + }, + "7_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish,Filipino,Chinese": 0.07386363636363637 + } + }, + "AC3_2": 0.333674477652266, + "AC3_3": 0.27127186475923404, + "AC3_4": 0.21712064624113733, + "AC3_5": 0.17473112102561852, + "AC3_6": 0.14146567713914657, + "AC3_7": 0.11377984385808208 + }, + "prompt_5": { + "overall_acc": 0.24675324675324672, + "language_acc": { + "Vietnamese": 0.25, + "Indonesian": 0.22727272727272727, + "Malay": 0.22727272727272727, + "English": 0.23295454545454544, + "Spanish": 0.2556818181818182, + "Filipino": 0.23863636363636365, + "Chinese": 0.29545454545454547 + }, + "consistency_score_2": 0.44237012987012975, + "consistency_score_3": 0.23620129870129872, + "consistency_score_4": 0.14464285714285716, + "consistency_score_5": 0.10010822510822512, + "consistency_score_6": 0.0762987012987013, + "consistency_score_7": 0.0625, + "detailed_consistency_score": { + "2_combine": { + "Vietnamese,Indonesian": 0.39204545454545453, + "Vietnamese,Malay": 0.38636363636363635, + "Vietnamese,English": 0.5568181818181818, + "Vietnamese,Spanish": 0.5284090909090909, + "Vietnamese,Filipino": 0.3693181818181818, + "Vietnamese,Chinese": 0.35795454545454547, + "Indonesian,Malay": 0.8068181818181818, + "Indonesian,English": 0.3181818181818182, + "Indonesian,Spanish": 0.3352272727272727, + "Indonesian,Filipino": 0.8011363636363636, + "Indonesian,Chinese": 0.39204545454545453, + "Malay,English": 0.3068181818181818, + "Malay,Spanish": 0.3125, + "Malay,Filipino": 0.8977272727272727, + "Malay,Chinese": 0.375, + "English,Spanish": 0.5965909090909091, + "English,Filipino": 0.26136363636363635, + "English,Chinese": 0.2840909090909091, + "Spanish,Filipino": 0.2897727272727273, + "Spanish,Chinese": 0.375, + "Filipino,Chinese": 0.3465909090909091 + }, + "3_combine": { + "Vietnamese,Indonesian,Malay": 0.32954545454545453, + "Vietnamese,Indonesian,English": 0.2159090909090909, + "Vietnamese,Indonesian,Spanish": 0.19318181818181818, + "Vietnamese,Indonesian,Filipino": 0.3068181818181818, + "Vietnamese,Indonesian,Chinese": 0.19318181818181818, + "Vietnamese,Malay,English": 0.19886363636363635, + "Vietnamese,Malay,Spanish": 0.16477272727272727, + "Vietnamese,Malay,Filipino": 0.3409090909090909, + "Vietnamese,Malay,Chinese": 0.1875, + "Vietnamese,English,Spanish": 0.39204545454545453, + "Vietnamese,English,Filipino": 0.17045454545454544, + "Vietnamese,English,Chinese": 0.20454545454545456, + "Vietnamese,Spanish,Filipino": 0.14204545454545456, + "Vietnamese,Spanish,Chinese": 0.19886363636363635, + "Vietnamese,Filipino,Chinese": 0.17613636363636365, + "Indonesian,Malay,English": 0.2556818181818182, + "Indonesian,Malay,Spanish": 0.26136363636363635, + "Indonesian,Malay,Filipino": 0.7556818181818182, + "Indonesian,Malay,Chinese": 0.3125, + "Indonesian,English,Spanish": 0.20454545454545456, + "Indonesian,English,Filipino": 0.2215909090909091, + "Indonesian,English,Chinese": 0.1590909090909091, + "Indonesian,Spanish,Filipino": 0.23863636363636365, + "Indonesian,Spanish,Chinese": 0.17045454545454544, + "Indonesian,Filipino,Chinese": 0.29545454545454547, + "Malay,English,Spanish": 0.1875, + "Malay,English,Filipino": 0.24431818181818182, + "Malay,English,Chinese": 0.14772727272727273, + "Malay,Spanish,Filipino": 0.2727272727272727, + "Malay,Spanish,Chinese": 0.16477272727272727, + "Malay,Filipino,Chinese": 0.32954545454545453, + "English,Spanish,Filipino": 0.1590909090909091, + "English,Spanish,Chinese": 0.19318181818181818, + "English,Filipino,Chinese": 0.125, + "Spanish,Filipino,Chinese": 0.1534090909090909 + }, + "4_combine": { + "Vietnamese,Indonesian,Malay,English": 0.17045454545454544, + "Vietnamese,Indonesian,Malay,Spanish": 0.14772727272727273, + "Vietnamese,Indonesian,Malay,Filipino": 0.2897727272727273, + "Vietnamese,Indonesian,Malay,Chinese": 0.17045454545454544, + "Vietnamese,Indonesian,English,Spanish": 0.1590909090909091, + "Vietnamese,Indonesian,English,Filipino": 0.14204545454545456, + "Vietnamese,Indonesian,English,Chinese": 0.125, + "Vietnamese,Indonesian,Spanish,Filipino": 0.11931818181818182, + "Vietnamese,Indonesian,Spanish,Chinese": 0.10795454545454546, + "Vietnamese,Indonesian,Filipino,Chinese": 0.1590909090909091, + "Vietnamese,Malay,English,Spanish": 0.13068181818181818, + "Vietnamese,Malay,English,Filipino": 0.1534090909090909, + "Vietnamese,Malay,English,Chinese": 0.11363636363636363, + "Vietnamese,Malay,Spanish,Filipino": 0.13068181818181818, + "Vietnamese,Malay,Spanish,Chinese": 0.09659090909090909, + "Vietnamese,Malay,Filipino,Chinese": 0.1590909090909091, + "Vietnamese,English,Spanish,Filipino": 0.10795454545454546, + "Vietnamese,English,Spanish,Chinese": 0.14772727272727273, + "Vietnamese,English,Filipino,Chinese": 0.09659090909090909, + "Vietnamese,Spanish,Filipino,Chinese": 0.08522727272727272, + "Indonesian,Malay,English,Spanish": 0.1590909090909091, + "Indonesian,Malay,English,Filipino": 0.21022727272727273, + "Indonesian,Malay,English,Chinese": 0.13636363636363635, + "Indonesian,Malay,Spanish,Filipino": 0.22727272727272727, + "Indonesian,Malay,Spanish,Chinese": 0.14772727272727273, + "Indonesian,Malay,Filipino,Chinese": 0.2840909090909091, + "Indonesian,English,Spanish,Filipino": 0.13068181818181818, + "Indonesian,English,Spanish,Chinese": 0.10795454545454546, + "Indonesian,English,Filipino,Chinese": 0.11363636363636363, + "Indonesian,Spanish,Filipino,Chinese": 0.13068181818181818, + "Malay,English,Spanish,Filipino": 0.14772727272727273, + "Malay,English,Spanish,Chinese": 0.10227272727272728, + "Malay,English,Filipino,Chinese": 0.11363636363636363, + "Malay,Spanish,Filipino,Chinese": 0.14772727272727273, + "English,Spanish,Filipino,Chinese": 0.09090909090909091 + }, + "5_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish": 0.11931818181818182, + "Vietnamese,Indonesian,Malay,English,Filipino": 0.13068181818181818, + "Vietnamese,Indonesian,Malay,English,Chinese": 0.10795454545454546, + "Vietnamese,Indonesian,Malay,Spanish,Filipino": 0.11363636363636363, + "Vietnamese,Indonesian,Malay,Spanish,Chinese": 0.09659090909090909, + "Vietnamese,Indonesian,Malay,Filipino,Chinese": 0.14772727272727273, + "Vietnamese,Indonesian,English,Spanish,Filipino": 0.09090909090909091, + "Vietnamese,Indonesian,English,Spanish,Chinese": 0.09090909090909091, + "Vietnamese,Indonesian,English,Filipino,Chinese": 0.09090909090909091, + "Vietnamese,Indonesian,Spanish,Filipino,Chinese": 0.07954545454545454, + "Vietnamese,Malay,English,Spanish,Filipino": 0.09659090909090909, + "Vietnamese,Malay,English,Spanish,Chinese": 0.07954545454545454, + "Vietnamese,Malay,English,Filipino,Chinese": 0.08522727272727272, + "Vietnamese,Malay,Spanish,Filipino,Chinese": 0.07954545454545454, + "Vietnamese,English,Spanish,Filipino,Chinese": 0.06818181818181818, + "Indonesian,Malay,English,Spanish,Filipino": 0.125, + "Indonesian,Malay,English,Spanish,Chinese": 0.09659090909090909, + "Indonesian,Malay,English,Filipino,Chinese": 0.10795454545454546, + "Indonesian,Malay,Spanish,Filipino,Chinese": 0.13068181818181818, + "Indonesian,English,Spanish,Filipino,Chinese": 0.07954545454545454, + "Malay,English,Spanish,Filipino,Chinese": 0.08522727272727272 + }, + "6_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish,Filipino": 0.08522727272727272, + "Vietnamese,Indonesian,Malay,English,Spanish,Chinese": 0.07954545454545454, + "Vietnamese,Indonesian,Malay,English,Filipino,Chinese": 0.08522727272727272, + "Vietnamese,Indonesian,Malay,Spanish,Filipino,Chinese": 0.07954545454545454, + "Vietnamese,Indonesian,English,Spanish,Filipino,Chinese": 0.0625, + "Vietnamese,Malay,English,Spanish,Filipino,Chinese": 0.0625, + "Indonesian,Malay,English,Spanish,Filipino,Chinese": 0.07954545454545454 + }, + "7_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish,Filipino,Chinese": 0.0625 + } + }, + "AC3_2": 0.31679745456067077, + "AC3_3": 0.2413619992952165, + "AC3_4": 0.1823783847374287, + "AC3_5": 0.14243167125176273, + "AC3_6": 0.11655680998806679, + "AC3_7": 0.09973753277614786 + } }, "sg_eval": { "prompt_1": { @@ -99196,9 +100138,15 @@ "prompt_2": { "accuracy": 0.2815533980582524 }, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_3": { + "accuracy": 0.2621359223300971 + }, + "prompt_4": { + "accuracy": 0.2524271844660194 + }, + "prompt_5": { + "accuracy": 0.22330097087378642 + } }, "cn_eval": { "prompt_1": { @@ -99207,18 +100155,30 @@ "prompt_2": { "accuracy": 0.2571428571428571 }, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_3": { + "accuracy": 0.26666666666666666 + }, + "prompt_4": { + "accuracy": 0.3523809523809524 + }, + "prompt_5": { + "accuracy": 0.29523809523809524 + } }, "us_eval": { "prompt_1": { "accuracy": 0.2803738317757009 }, "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_3": { + "accuracy": 0.22429906542056074 + }, + "prompt_4": { + "accuracy": 0.24299065420560748 + }, + "prompt_5": { + "accuracy": 0.2523364485981308 + } }, "ph_eval": { "prompt_1": { @@ -99251,9 +100211,51 @@ "geography": 0.2 } }, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_3": { + "accuracy": 0.16, + "category_acc": { + "brand": 0.1, + "demographics": 0.2, + "biology": 0.3, + "history": 0.26666666666666666, + "literature": 0.1, + "politics": 0.1, + "culture": 0.0, + "film": 0.2, + "law": 0.1, + "geography": 0.2 + } + }, + "prompt_4": { + "accuracy": 0.25, + "category_acc": { + "brand": 0.2, + "demographics": 0.4, + "biology": 0.2, + "history": 0.2, + "literature": 0.3, + "politics": 0.3, + "culture": 0.3, + "film": 0.2, + "law": 0.2, + "geography": 0.3 + } + }, + "prompt_5": { + "accuracy": 0.22, + "category_acc": { + "brand": 0.1, + "demographics": 0.6, + "biology": 0.0, + "history": 0.13333333333333333, + "literature": 0.2, + "politics": 0.5, + "culture": 0.2, + "film": 0.3, + "law": 0.2, + "geography": 0.2 + } + } }, "sing2eng": { "prompt_1": { @@ -99262,9 +100264,15 @@ "prompt_2": { "bleu_score": 0.05391560916890579 }, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_3": { + "bleu_score": 0.054357373364412875 + }, + "prompt_4": { + "bleu_score": 0.05136450032379191 + }, + "prompt_5": { + "bleu_score": 0.04088471984093496 + } }, "flores_ind2eng": { "prompt_1": { @@ -99273,10 +100281,16 @@ "prompt_2": { "bleu_score": 0.06506413192248581 }, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 - }, + "prompt_3": { + "bleu_score": 0.0643938274362041 + }, + "prompt_4": { + "bleu_score": 0.11739459724583577 + }, + "prompt_5": { + "bleu_score": 0.036427882144920314 + } + }, "flores_vie2eng": { "prompt_1": { "bleu_score": 0.06846703826242148 @@ -99284,9 +100298,15 @@ "prompt_2": { "bleu_score": 0.051860041446938294 }, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_3": { + "bleu_score": 0.05265622408594447 + }, + "prompt_4": { + "bleu_score": 0.09344610758006208 + }, + "prompt_5": { + "bleu_score": 0.04748212974698807 + } }, "flores_zho2eng": { "prompt_1": { @@ -99295,9 +100315,15 @@ "prompt_2": { "bleu_score": 0.04908898844339031 }, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_3": { + "bleu_score": 0.0525572200769723 + }, + "prompt_4": { + "bleu_score": 0.09421045349075516 + }, + "prompt_5": { + "bleu_score": 0.04837140400422949 + } }, "flores_zsm2eng": { "prompt_1": { @@ -99306,9 +100332,15 @@ "prompt_2": { "bleu_score": 0.06748854822647792 }, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_3": { + "bleu_score": 0.06545876293952489 + }, + "prompt_4": { + "bleu_score": 0.12308563446237895 + }, + "prompt_5": { + "bleu_score": 0.02796179524686788 + } }, "mmlu": { "prompt_1": { @@ -99317,419 +100349,1321 @@ "prompt_2": { "accuracy": 0.2998833138856476 }, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 - }, - "mmlu_full": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 - }, - "c_eval": { - "prompt_1": { - "accuracy": 0.24888558692421991 + "prompt_3": { + "accuracy": 0.26837806301050177 }, - "prompt_2": { - "accuracy": 0.24219910846953938 + "prompt_4": { + "accuracy": 0.2660443407234539 }, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_5": { + "accuracy": 0.2602100350058343 + } }, - "c_eval_full": { + "mmlu_full": { "prompt_1": { - "accuracy": 0.24346201743462018, + "accuracy": 0.27415087593850557, "category_acc": { - "computer_network": 0.041666666666666664, - "operating_system": 0.375, - "computer_architecture": 0.2692307692307692, - "college_programming": 0.19047619047619047, - "college_physics": 0.08333333333333333, - "college_chemistry": 0.20689655172413793, - "advanced_mathematics": 0.2916666666666667, - "probability_and_statistics": 0.17391304347826086, - "discrete_mathematics": 0.2857142857142857, - "electrical_engineer": 0.3333333333333333, - "metrology_engineer": 0.20689655172413793, - "high_school_mathematics": 0.30434782608695654, - "high_school_physics": 0.2916666666666667, - "high_school_chemistry": 0.16666666666666666, - "high_school_biology": 0.375, - "middle_school_mathematics": 0.20833333333333334, - "middle_school_biology": 0.11538461538461539, - "middle_school_physics": 0.20833333333333334, - "middle_school_chemistry": 0.2, - "veterinary_medicine": 0.14285714285714285, - "college_economics": 0.2833333333333333, - "business_administration": 0.3157894736842105, - "marxism": 0.2916666666666667, - "mao_zedong_thought": 0.27586206896551724, - "education_science": 0.3235294117647059, - "teacher_qualification": 0.16326530612244897, - "high_school_politics": 0.3333333333333333, - "high_school_geography": 0.2916666666666667, - "middle_school_politics": 0.34615384615384615, - "middle_school_geography": 0.11764705882352941, - "modern_chinese_history": 0.25, - "ideological_and_moral_cultivation": 0.20833333333333334, - "logic": 0.2222222222222222, - "law": 0.13793103448275862, - "chinese_language_and_literature": 0.2857142857142857, - "art_studies": 0.3157894736842105, - "professional_tour_guide": 0.35294117647058826, - "legal_professional": 0.10714285714285714, - "high_school_chinese": 0.125, - "high_school_history": 0.36, - "middle_school_history": 0.18518518518518517, - "civil_servant": 0.25, - "sports_science": 0.2916666666666667, - "plant_protection": 0.2962962962962963, - "basic_medicine": 0.375, - "clinical_medicine": 0.2222222222222222, - "urban_and_rural_planner": 0.3333333333333333, - "accountant": 0.16666666666666666, - "fire_engineer": 0.16666666666666666, - "environmental_impact_assessment_engineer": 0.2222222222222222, - "tax_accountant": 0.2222222222222222, - "physician": 0.25925925925925924 + "high_school_european_history": 0.25, + "business_ethics": 0.18181818181818182, + "clinical_knowledge": 0.25, + "medical_genetics": 0.32323232323232326, + "high_school_us_history": 0.270935960591133, + "high_school_physics": 0.28, + "high_school_world_history": 0.21610169491525424, + "virology": 0.30303030303030304, + "high_school_microeconomics": 0.24050632911392406, + "econometrics": 0.2831858407079646, + "college_computer_science": 0.2727272727272727, + "high_school_biology": 0.2815533980582524, + "abstract_algebra": 0.18181818181818182, + "professional_accounting": 0.24199288256227758, + "philosophy": 0.26129032258064516, + "professional_medicine": 0.34317343173431736, + "nutrition": 0.3114754098360656, + "global_facts": 0.29292929292929293, + "machine_learning": 0.21621621621621623, + "security_studies": 0.26229508196721313, + "public_relations": 0.30275229357798167, + "professional_psychology": 0.24877250409165302, + "prehistory": 0.30959752321981426, + "anatomy": 0.29850746268656714, + "human_sexuality": 0.33076923076923076, + "college_medicine": 0.19767441860465115, + "high_school_government_and_politics": 0.359375, + "college_chemistry": 0.16161616161616163, + "logical_fallacies": 0.2777777777777778, + "high_school_geography": 0.3604060913705584, + "elementary_mathematics": 0.2519893899204244, + "human_aging": 0.25675675675675674, + "college_mathematics": 0.30303030303030304, + "high_school_psychology": 0.27941176470588236, + "formal_logic": 0.296, + "high_school_statistics": 0.32558139534883723, + "international_law": 0.3416666666666667, + "high_school_mathematics": 0.27137546468401486, + "high_school_computer_science": 0.30303030303030304, + "conceptual_physics": 0.27350427350427353, + "miscellaneous": 0.26342710997442453, + "high_school_chemistry": 0.25742574257425743, + "marketing": 0.2446351931330472, + "professional_law": 0.28636660143509457, + "management": 0.29411764705882354, + "college_physics": 0.21782178217821782, + "jurisprudence": 0.2803738317757009, + "world_religions": 0.29411764705882354, + "sociology": 0.24, + "us_foreign_policy": 0.25252525252525254, + "high_school_macroeconomics": 0.2776349614395887, + "computer_security": 0.2828282828282828, + "moral_scenarios": 0.24720357941834453, + "moral_disputes": 0.28405797101449276, + "electrical_engineering": 0.2916666666666667, + "astronomy": 0.2980132450331126, + "college_biology": 0.35664335664335667 } }, "prompt_2": { - "accuracy": 0.25093399750934, + "accuracy": 0.26428316052913836, "category_acc": { - "computer_network": 0.08333333333333333, - "operating_system": 0.2916666666666667, - "computer_architecture": 0.23076923076923078, - "college_programming": 0.16666666666666666, - "college_physics": 0.125, - "college_chemistry": 0.27586206896551724, - "advanced_mathematics": 0.25, - "probability_and_statistics": 0.17391304347826086, - "discrete_mathematics": 0.3333333333333333, - "electrical_engineer": 0.2619047619047619, - "metrology_engineer": 0.20689655172413793, - "high_school_mathematics": 0.34782608695652173, - "high_school_physics": 0.16666666666666666, - "high_school_chemistry": 0.25, - "high_school_biology": 0.3333333333333333, - "middle_school_mathematics": 0.25, - "middle_school_biology": 0.38461538461538464, - "middle_school_physics": 0.20833333333333334, - "middle_school_chemistry": 0.16, - "veterinary_medicine": 0.25, - "college_economics": 0.25, - "business_administration": 0.23684210526315788, - "marxism": 0.25, - "mao_zedong_thought": 0.27586206896551724, - "education_science": 0.29411764705882354, - "teacher_qualification": 0.2857142857142857, - "high_school_politics": 0.20833333333333334, - "high_school_geography": 0.125, - "middle_school_politics": 0.2692307692307692, - "middle_school_geography": 0.23529411764705882, - "modern_chinese_history": 0.25, - "ideological_and_moral_cultivation": 0.125, - "logic": 0.18518518518518517, - "law": 0.2413793103448276, - "chinese_language_and_literature": 0.2857142857142857, - "art_studies": 0.3157894736842105, - "professional_tour_guide": 0.38235294117647056, - "legal_professional": 0.07142857142857142, - "high_school_chinese": 0.125, - "high_school_history": 0.32, - "middle_school_history": 0.37037037037037035, - "civil_servant": 0.19230769230769232, - "sports_science": 0.5, - "plant_protection": 0.18518518518518517, - "basic_medicine": 0.4166666666666667, - "clinical_medicine": 0.2962962962962963, - "urban_and_rural_planner": 0.37254901960784315, - "accountant": 0.18518518518518517, - "fire_engineer": 0.25, - "environmental_impact_assessment_engineer": 0.19444444444444445, - "tax_accountant": 0.24074074074074073, - "physician": 0.2962962962962963 + "high_school_european_history": 0.2804878048780488, + "business_ethics": 0.25252525252525254, + "clinical_knowledge": 0.2689393939393939, + "medical_genetics": 0.2222222222222222, + "high_school_us_history": 0.24630541871921183, + "high_school_physics": 0.26666666666666666, + "high_school_world_history": 0.25, + "virology": 0.21212121212121213, + "high_school_microeconomics": 0.2869198312236287, + "econometrics": 0.22123893805309736, + "college_computer_science": 0.16161616161616163, + "high_school_biology": 0.2750809061488673, + "abstract_algebra": 0.23232323232323232, + "professional_accounting": 0.2597864768683274, + "philosophy": 0.24838709677419354, + "professional_medicine": 0.3062730627306273, + "nutrition": 0.29180327868852457, + "global_facts": 0.18181818181818182, + "machine_learning": 0.32432432432432434, + "security_studies": 0.3360655737704918, + "public_relations": 0.26605504587155965, + "professional_psychology": 0.24713584288052373, + "prehistory": 0.21981424148606812, + "anatomy": 0.20149253731343283, + "human_sexuality": 0.26153846153846155, + "college_medicine": 0.2441860465116279, + "high_school_government_and_politics": 0.34375, + "college_chemistry": 0.26262626262626265, + "logical_fallacies": 0.2716049382716049, + "high_school_geography": 0.3248730964467005, + "elementary_mathematics": 0.23607427055702918, + "human_aging": 0.1981981981981982, + "college_mathematics": 0.26262626262626265, + "high_school_psychology": 0.35294117647058826, + "formal_logic": 0.312, + "high_school_statistics": 0.33488372093023255, + "international_law": 0.21666666666666667, + "high_school_mathematics": 0.2342007434944238, + "high_school_computer_science": 0.2222222222222222, + "conceptual_physics": 0.2564102564102564, + "miscellaneous": 0.23273657289002558, + "high_school_chemistry": 0.2623762376237624, + "marketing": 0.22746781115879827, + "professional_law": 0.2831050228310502, + "management": 0.3431372549019608, + "college_physics": 0.21782178217821782, + "jurisprudence": 0.24299065420560748, + "world_religions": 0.21764705882352942, + "sociology": 0.285, + "us_foreign_policy": 0.29292929292929293, + "high_school_macroeconomics": 0.2827763496143959, + "computer_security": 0.21212121212121213, + "moral_scenarios": 0.24496644295302014, + "moral_disputes": 0.2463768115942029, + "electrical_engineering": 0.2916666666666667, + "astronomy": 0.271523178807947, + "college_biology": 0.27972027972027974 } }, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 - }, - "cmmlu": { - "prompt_1": { - "accuracy": 0.2903225806451613 - }, - "prompt_2": { - "accuracy": 0.2903225806451613 - }, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 - }, - "cmmlu_full": { - "prompt_1": { - "accuracy": 0.2562597133482991, + "prompt_3": { + "accuracy": 0.26671433678941725, "category_acc": { - "agronomy": 0.23076923076923078, - "anatomy": 0.27702702702702703, - "ancient_chinese": 0.25, - "arts": 0.25625, - "astronomy": 0.2545454545454545, - "business_ethics": 0.2535885167464115, - "chinese_civil_service_exam": 0.24375, - "chinese_driving_rule": 0.2824427480916031, - "chinese_food_culture": 0.25735294117647056, - "chinese_foreign_policy": 0.2523364485981308, - "chinese_history": 0.25696594427244585, - "chinese_literature": 0.2647058823529412, - "chinese_teacher_qualification": 0.2681564245810056, - "clinical_knowledge": 0.2489451476793249, - "college_actuarial_science": 0.24528301886792453, - "college_education": 0.29906542056074764, - "college_engineering_hydrology": 0.22641509433962265, - "college_law": 0.24074074074074073, - "college_mathematics": 0.20952380952380953, - "college_medical_statistics": 0.25471698113207547, - "college_medicine": 0.25274725274725274, - "computer_science": 0.2696078431372549, - "computer_security": 0.23391812865497075, - "conceptual_physics": 0.2653061224489796, - "construction_project_management": 0.2589928057553957, - "economics": 0.25157232704402516, - "education": 0.25766871165644173, - "electrical_engineering": 0.2616279069767442, - "elementary_chinese": 0.2777777777777778, - "elementary_commonsense": 0.23737373737373738, - "elementary_information_and_technology": 0.27310924369747897, - "elementary_mathematics": 0.28695652173913044, - "ethnology": 0.26666666666666666, - "food_science": 0.27972027972027974, - "genetics": 0.2556818181818182, - "global_facts": 0.2483221476510067, - "high_school_biology": 0.28402366863905326, - "high_school_chemistry": 0.2803030303030303, - "high_school_geography": 0.2457627118644068, - "high_school_mathematics": 0.27439024390243905, - "high_school_physics": 0.23636363636363636, - "high_school_politics": 0.23776223776223776, - "human_sexuality": 0.23809523809523808, - "international_law": 0.22702702702702704, - "journalism": 0.23837209302325582, - "jurisprudence": 0.25790754257907544, - "legal_and_moral_basis": 0.2850467289719626, - "logical": 0.2845528455284553, - "machine_learning": 0.22950819672131148, - "management": 0.2571428571428571, - "marketing": 0.2111111111111111, - "marxist_theory": 0.25396825396825395, - "modern_chinese": 0.21551724137931033, - "nutrition": 0.27586206896551724, - "philosophy": 0.24761904761904763, - "professional_accounting": 0.26285714285714284, - "professional_law": 0.25118483412322273, - "professional_medicine": 0.2632978723404255, - "professional_psychology": 0.27155172413793105, - "public_relations": 0.28160919540229884, - "security_study": 0.2814814814814815, - "sociology": 0.23893805309734514, - "sports_science": 0.23030303030303031, - "traditional_chinese_medicine": 0.20540540540540542, - "virology": 0.27218934911242604, - "world_history": 0.21739130434782608, - "world_religions": 0.3 + "high_school_european_history": 0.2804878048780488, + "business_ethics": 0.24242424242424243, + "clinical_knowledge": 0.2765151515151515, + "medical_genetics": 0.2828282828282828, + "high_school_us_history": 0.24630541871921183, + "high_school_physics": 0.30666666666666664, + "high_school_world_history": 0.25, + "virology": 0.20606060606060606, + "high_school_microeconomics": 0.31645569620253167, + "econometrics": 0.3008849557522124, + "college_computer_science": 0.1919191919191919, + "high_school_biology": 0.3074433656957929, + "abstract_algebra": 0.2222222222222222, + "professional_accounting": 0.24555160142348753, + "philosophy": 0.26129032258064516, + "professional_medicine": 0.4022140221402214, + "nutrition": 0.28524590163934427, + "global_facts": 0.21212121212121213, + "machine_learning": 0.24324324324324326, + "security_studies": 0.36065573770491804, + "public_relations": 0.23853211009174313, + "professional_psychology": 0.2225859247135843, + "prehistory": 0.2476780185758514, + "anatomy": 0.17164179104477612, + "human_sexuality": 0.3, + "college_medicine": 0.29069767441860467, + "high_school_government_and_politics": 0.3697916666666667, + "college_chemistry": 0.24242424242424243, + "logical_fallacies": 0.24691358024691357, + "high_school_geography": 0.3197969543147208, + "elementary_mathematics": 0.246684350132626, + "human_aging": 0.1891891891891892, + "college_mathematics": 0.3434343434343434, + "high_school_psychology": 0.3180147058823529, + "formal_logic": 0.312, + "high_school_statistics": 0.35348837209302325, + "international_law": 0.175, + "high_school_mathematics": 0.23048327137546468, + "high_school_computer_science": 0.2222222222222222, + "conceptual_physics": 0.28205128205128205, + "miscellaneous": 0.2289002557544757, + "high_school_chemistry": 0.2623762376237624, + "marketing": 0.19313304721030042, + "professional_law": 0.2720156555772994, + "management": 0.3137254901960784, + "college_physics": 0.31683168316831684, + "jurisprudence": 0.205607476635514, + "world_religions": 0.18823529411764706, + "sociology": 0.23, + "us_foreign_policy": 0.29292929292929293, + "high_school_macroeconomics": 0.3393316195372751, + "computer_security": 0.21212121212121213, + "moral_scenarios": 0.24496644295302014, + "moral_disputes": 0.2492753623188406, + "electrical_engineering": 0.22916666666666666, + "astronomy": 0.2913907284768212, + "college_biology": 0.2867132867132867 } }, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 - }, - "zbench": { - "prompt_1": { - "accuracy": 0.2727272727272727 - }, - "prompt_2": { - "accuracy": 0.24242424242424243 - }, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 - }, - "ind_emotion": { - "prompt_1": { - "accuracy": 0.17272727272727273 - }, - "prompt_2": { - "accuracy": 0.11818181818181818 + "prompt_4": { + "accuracy": 0.27143367894172327, + "category_acc": { + "high_school_european_history": 0.24390243902439024, + "business_ethics": 0.30303030303030304, + "clinical_knowledge": 0.25, + "medical_genetics": 0.30303030303030304, + "high_school_us_history": 0.2660098522167488, + "high_school_physics": 0.24, + "high_school_world_history": 0.2584745762711864, + "virology": 0.3212121212121212, + "high_school_microeconomics": 0.2911392405063291, + "econometrics": 0.2920353982300885, + "college_computer_science": 0.21212121212121213, + "high_school_biology": 0.3074433656957929, + "abstract_algebra": 0.1919191919191919, + "professional_accounting": 0.23487544483985764, + "philosophy": 0.2967741935483871, + "professional_medicine": 0.31365313653136534, + "nutrition": 0.3180327868852459, + "global_facts": 0.2828282828282828, + "machine_learning": 0.23423423423423423, + "security_studies": 0.3155737704918033, + "public_relations": 0.3394495412844037, + "professional_psychology": 0.2733224222585925, + "prehistory": 0.28173374613003094, + "anatomy": 0.20149253731343283, + "human_sexuality": 0.2846153846153846, + "college_medicine": 0.23837209302325582, + "high_school_government_and_politics": 0.3697916666666667, + "college_chemistry": 0.1919191919191919, + "logical_fallacies": 0.24074074074074073, + "high_school_geography": 0.3197969543147208, + "elementary_mathematics": 0.246684350132626, + "human_aging": 0.24324324324324326, + "college_mathematics": 0.24242424242424243, + "high_school_psychology": 0.29411764705882354, + "formal_logic": 0.264, + "high_school_statistics": 0.3023255813953488, + "international_law": 0.25, + "high_school_mathematics": 0.26022304832713755, + "high_school_computer_science": 0.21212121212121213, + "conceptual_physics": 0.3076923076923077, + "miscellaneous": 0.2749360613810742, + "high_school_chemistry": 0.25742574257425743, + "marketing": 0.2446351931330472, + "professional_law": 0.25962165688193084, + "management": 0.2549019607843137, + "college_physics": 0.18811881188118812, + "jurisprudence": 0.308411214953271, + "world_religions": 0.27058823529411763, + "sociology": 0.325, + "us_foreign_policy": 0.2727272727272727, + "high_school_macroeconomics": 0.2827763496143959, + "computer_security": 0.2828282828282828, + "moral_scenarios": 0.24384787472035793, + "moral_disputes": 0.25507246376811593, + "electrical_engineering": 0.2777777777777778, + "astronomy": 0.271523178807947, + "college_biology": 0.2867132867132867 + } }, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_5": { + "accuracy": 0.2729352878083661, + "category_acc": { + "high_school_european_history": 0.1951219512195122, + "business_ethics": 0.2222222222222222, + "clinical_knowledge": 0.2878787878787879, + "medical_genetics": 0.2828282828282828, + "high_school_us_history": 0.2857142857142857, + "high_school_physics": 0.29333333333333333, + "high_school_world_history": 0.21610169491525424, + "virology": 0.26666666666666666, + "high_school_microeconomics": 0.2911392405063291, + "econometrics": 0.2831858407079646, + "college_computer_science": 0.2222222222222222, + "high_school_biology": 0.3300970873786408, + "abstract_algebra": 0.25252525252525254, + "professional_accounting": 0.23487544483985764, + "philosophy": 0.27741935483870966, + "professional_medicine": 0.3247232472324723, + "nutrition": 0.3344262295081967, + "global_facts": 0.23232323232323232, + "machine_learning": 0.22522522522522523, + "security_studies": 0.36065573770491804, + "public_relations": 0.3669724770642202, + "professional_psychology": 0.24877250409165302, + "prehistory": 0.29102167182662536, + "anatomy": 0.2537313432835821, + "human_sexuality": 0.3153846153846154, + "college_medicine": 0.23255813953488372, + "high_school_government_and_politics": 0.3541666666666667, + "college_chemistry": 0.2222222222222222, + "logical_fallacies": 0.22839506172839505, + "high_school_geography": 0.34517766497461927, + "elementary_mathematics": 0.2546419098143236, + "human_aging": 0.19369369369369369, + "college_mathematics": 0.2727272727272727, + "high_school_psychology": 0.3272058823529412, + "formal_logic": 0.296, + "high_school_statistics": 0.3116279069767442, + "international_law": 0.25833333333333336, + "high_school_mathematics": 0.24907063197026022, + "high_school_computer_science": 0.23232323232323232, + "conceptual_physics": 0.26495726495726496, + "miscellaneous": 0.26342710997442453, + "high_school_chemistry": 0.25742574257425743, + "marketing": 0.2832618025751073, + "professional_law": 0.26353555120678407, + "management": 0.35294117647058826, + "college_physics": 0.1782178217821782, + "jurisprudence": 0.2523364485981308, + "world_religions": 0.2823529411764706, + "sociology": 0.28, + "us_foreign_policy": 0.24242424242424243, + "high_school_macroeconomics": 0.30848329048843187, + "computer_security": 0.2828282828282828, + "moral_scenarios": 0.2225950782997763, + "moral_disputes": 0.26666666666666666, + "electrical_engineering": 0.2847222222222222, + "astronomy": 0.33774834437086093, + "college_biology": 0.2727272727272727 + } + } }, - "ocnli": { + "c_eval": { "prompt_1": { - "accuracy": 0.3227118644067797 + "accuracy": 0.24888558692421991 }, "prompt_2": { - "accuracy": 0.3264406779661017 - }, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 - }, - "c3": { - "prompt_1": { - "accuracy": 0.281226626776365 + "accuracy": 0.24219910846953938 }, - "prompt_2": { - "accuracy": 0.2737471952131638 + "prompt_3": { + "accuracy": 0.24219910846953938 }, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 - }, - "dream": { - "prompt_1": { - "accuracy": 0.3258206761391475 + "prompt_4": { + "accuracy": 0.23402674591381872 }, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_5": { + "accuracy": 0.24962852897473997 + } }, - "samsum": { + "c_eval_full": { "prompt_1": { - "rouge1": 0.21071721731902832, - "rouge2": 0.06263208088632324, - "rougeL": 0.15873185260050474, - "avg_rouge": 0.14402705026861876 - }, - "prompt_2": { - "rouge1": 0.17477087124858154, - "rouge2": 0.05161058151166577, - "rougeL": 0.13780980003856205, - "avg_rouge": 0.12139708426626979 - }, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 - }, - "dialogsum": { - "prompt_1": { - "rouge1": 0.2095880648481376, - "rouge2": 0.05886625847491587, - "rougeL": 0.15351111164357276, - "avg_rouge": 0.1406551449888754 - }, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 - }, - "sst2": { - "prompt_1": { - "accuracy": 0.5034403669724771 + "accuracy": 0.24346201743462018, + "category_acc": { + "computer_network": 0.041666666666666664, + "operating_system": 0.375, + "computer_architecture": 0.2692307692307692, + "college_programming": 0.19047619047619047, + "college_physics": 0.08333333333333333, + "college_chemistry": 0.20689655172413793, + "advanced_mathematics": 0.2916666666666667, + "probability_and_statistics": 0.17391304347826086, + "discrete_mathematics": 0.2857142857142857, + "electrical_engineer": 0.3333333333333333, + "metrology_engineer": 0.20689655172413793, + "high_school_mathematics": 0.30434782608695654, + "high_school_physics": 0.2916666666666667, + "high_school_chemistry": 0.16666666666666666, + "high_school_biology": 0.375, + "middle_school_mathematics": 0.20833333333333334, + "middle_school_biology": 0.11538461538461539, + "middle_school_physics": 0.20833333333333334, + "middle_school_chemistry": 0.2, + "veterinary_medicine": 0.14285714285714285, + "college_economics": 0.2833333333333333, + "business_administration": 0.3157894736842105, + "marxism": 0.2916666666666667, + "mao_zedong_thought": 0.27586206896551724, + "education_science": 0.3235294117647059, + "teacher_qualification": 0.16326530612244897, + "high_school_politics": 0.3333333333333333, + "high_school_geography": 0.2916666666666667, + "middle_school_politics": 0.34615384615384615, + "middle_school_geography": 0.11764705882352941, + "modern_chinese_history": 0.25, + "ideological_and_moral_cultivation": 0.20833333333333334, + "logic": 0.2222222222222222, + "law": 0.13793103448275862, + "chinese_language_and_literature": 0.2857142857142857, + "art_studies": 0.3157894736842105, + "professional_tour_guide": 0.35294117647058826, + "legal_professional": 0.10714285714285714, + "high_school_chinese": 0.125, + "high_school_history": 0.36, + "middle_school_history": 0.18518518518518517, + "civil_servant": 0.25, + "sports_science": 0.2916666666666667, + "plant_protection": 0.2962962962962963, + "basic_medicine": 0.375, + "clinical_medicine": 0.2222222222222222, + "urban_and_rural_planner": 0.3333333333333333, + "accountant": 0.16666666666666666, + "fire_engineer": 0.16666666666666666, + "environmental_impact_assessment_engineer": 0.2222222222222222, + "tax_accountant": 0.2222222222222222, + "physician": 0.25925925925925924 + } }, "prompt_2": { - "accuracy": 0.4805045871559633 + "accuracy": 0.25093399750934, + "category_acc": { + "computer_network": 0.08333333333333333, + "operating_system": 0.2916666666666667, + "computer_architecture": 0.23076923076923078, + "college_programming": 0.16666666666666666, + "college_physics": 0.125, + "college_chemistry": 0.27586206896551724, + "advanced_mathematics": 0.25, + "probability_and_statistics": 0.17391304347826086, + "discrete_mathematics": 0.3333333333333333, + "electrical_engineer": 0.2619047619047619, + "metrology_engineer": 0.20689655172413793, + "high_school_mathematics": 0.34782608695652173, + "high_school_physics": 0.16666666666666666, + "high_school_chemistry": 0.25, + "high_school_biology": 0.3333333333333333, + "middle_school_mathematics": 0.25, + "middle_school_biology": 0.38461538461538464, + "middle_school_physics": 0.20833333333333334, + "middle_school_chemistry": 0.16, + "veterinary_medicine": 0.25, + "college_economics": 0.25, + "business_administration": 0.23684210526315788, + "marxism": 0.25, + "mao_zedong_thought": 0.27586206896551724, + "education_science": 0.29411764705882354, + "teacher_qualification": 0.2857142857142857, + "high_school_politics": 0.20833333333333334, + "high_school_geography": 0.125, + "middle_school_politics": 0.2692307692307692, + "middle_school_geography": 0.23529411764705882, + "modern_chinese_history": 0.25, + "ideological_and_moral_cultivation": 0.125, + "logic": 0.18518518518518517, + "law": 0.2413793103448276, + "chinese_language_and_literature": 0.2857142857142857, + "art_studies": 0.3157894736842105, + "professional_tour_guide": 0.38235294117647056, + "legal_professional": 0.07142857142857142, + "high_school_chinese": 0.125, + "high_school_history": 0.32, + "middle_school_history": 0.37037037037037035, + "civil_servant": 0.19230769230769232, + "sports_science": 0.5, + "plant_protection": 0.18518518518518517, + "basic_medicine": 0.4166666666666667, + "clinical_medicine": 0.2962962962962963, + "urban_and_rural_planner": 0.37254901960784315, + "accountant": 0.18518518518518517, + "fire_engineer": 0.25, + "environmental_impact_assessment_engineer": 0.19444444444444445, + "tax_accountant": 0.24074074074074073, + "physician": 0.2962962962962963 + } }, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 - }, - "cola": { - "prompt_1": { - "accuracy": 0.5043144774688398 + "prompt_3": { + "accuracy": 0.24906600249066002, + "category_acc": { + "computer_network": 0.125, + "operating_system": 0.2916666666666667, + "computer_architecture": 0.38461538461538464, + "college_programming": 0.16666666666666666, + "college_physics": 0.16666666666666666, + "college_chemistry": 0.20689655172413793, + "advanced_mathematics": 0.2916666666666667, + "probability_and_statistics": 0.17391304347826086, + "discrete_mathematics": 0.3333333333333333, + "electrical_engineer": 0.2857142857142857, + "metrology_engineer": 0.2413793103448276, + "high_school_mathematics": 0.391304347826087, + "high_school_physics": 0.20833333333333334, + "high_school_chemistry": 0.25, + "high_school_biology": 0.3333333333333333, + "middle_school_mathematics": 0.25, + "middle_school_biology": 0.2692307692307692, + "middle_school_physics": 0.125, + "middle_school_chemistry": 0.12, + "veterinary_medicine": 0.17857142857142858, + "college_economics": 0.26666666666666666, + "business_administration": 0.18421052631578946, + "marxism": 0.2916666666666667, + "mao_zedong_thought": 0.2413793103448276, + "education_science": 0.29411764705882354, + "teacher_qualification": 0.30612244897959184, + "high_school_politics": 0.25, + "high_school_geography": 0.125, + "middle_school_politics": 0.15384615384615385, + "middle_school_geography": 0.17647058823529413, + "modern_chinese_history": 0.21428571428571427, + "ideological_and_moral_cultivation": 0.20833333333333334, + "logic": 0.18518518518518517, + "law": 0.1724137931034483, + "chinese_language_and_literature": 0.35714285714285715, + "art_studies": 0.34210526315789475, + "professional_tour_guide": 0.35294117647058826, + "legal_professional": 0.14285714285714285, + "high_school_chinese": 0.125, + "high_school_history": 0.36, + "middle_school_history": 0.3333333333333333, + "civil_servant": 0.2692307692307692, + "sports_science": 0.3333333333333333, + "plant_protection": 0.2222222222222222, + "basic_medicine": 0.4166666666666667, + "clinical_medicine": 0.25925925925925924, + "urban_and_rural_planner": 0.29411764705882354, + "accountant": 0.2037037037037037, + "fire_engineer": 0.2777777777777778, + "environmental_impact_assessment_engineer": 0.2777777777777778, + "tax_accountant": 0.2222222222222222, + "physician": 0.2222222222222222 + } }, - "prompt_2": { - "accuracy": 0.5349952061361457 + "prompt_4": { + "accuracy": 0.2465753424657534, + "category_acc": { + "computer_network": 0.08333333333333333, + "operating_system": 0.25, + "computer_architecture": 0.2692307692307692, + "college_programming": 0.19047619047619047, + "college_physics": 0.16666666666666666, + "college_chemistry": 0.3448275862068966, + "advanced_mathematics": 0.2916666666666667, + "probability_and_statistics": 0.17391304347826086, + "discrete_mathematics": 0.14285714285714285, + "electrical_engineer": 0.2857142857142857, + "metrology_engineer": 0.27586206896551724, + "high_school_mathematics": 0.30434782608695654, + "high_school_physics": 0.375, + "high_school_chemistry": 0.3333333333333333, + "high_school_biology": 0.20833333333333334, + "middle_school_mathematics": 0.20833333333333334, + "middle_school_biology": 0.34615384615384615, + "middle_school_physics": 0.3333333333333333, + "middle_school_chemistry": 0.08, + "veterinary_medicine": 0.2857142857142857, + "college_economics": 0.25, + "business_administration": 0.2631578947368421, + "marxism": 0.4166666666666667, + "mao_zedong_thought": 0.27586206896551724, + "education_science": 0.3235294117647059, + "teacher_qualification": 0.1836734693877551, + "high_school_politics": 0.2916666666666667, + "high_school_geography": 0.4583333333333333, + "middle_school_politics": 0.2692307692307692, + "middle_school_geography": 0.17647058823529413, + "modern_chinese_history": 0.17857142857142858, + "ideological_and_moral_cultivation": 0.20833333333333334, + "logic": 0.14814814814814814, + "law": 0.20689655172413793, + "chinese_language_and_literature": 0.2857142857142857, + "art_studies": 0.23684210526315788, + "professional_tour_guide": 0.17647058823529413, + "legal_professional": 0.03571428571428571, + "high_school_chinese": 0.125, + "high_school_history": 0.44, + "middle_school_history": 0.14814814814814814, + "civil_servant": 0.19230769230769232, + "sports_science": 0.16666666666666666, + "plant_protection": 0.2962962962962963, + "basic_medicine": 0.25, + "clinical_medicine": 0.2962962962962963, + "urban_and_rural_planner": 0.27450980392156865, + "accountant": 0.2037037037037037, + "fire_engineer": 0.4166666666666667, + "environmental_impact_assessment_engineer": 0.19444444444444445, + "tax_accountant": 0.2777777777777778, + "physician": 0.24074074074074073 + } }, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_5": { + "accuracy": 0.24719800747198006, + "category_acc": { + "computer_network": 0.08333333333333333, + "operating_system": 0.3333333333333333, + "computer_architecture": 0.2692307692307692, + "college_programming": 0.23809523809523808, + "college_physics": 0.08333333333333333, + "college_chemistry": 0.20689655172413793, + "advanced_mathematics": 0.2916666666666667, + "probability_and_statistics": 0.17391304347826086, + "discrete_mathematics": 0.3333333333333333, + "electrical_engineer": 0.2619047619047619, + "metrology_engineer": 0.13793103448275862, + "high_school_mathematics": 0.34782608695652173, + "high_school_physics": 0.20833333333333334, + "high_school_chemistry": 0.25, + "high_school_biology": 0.375, + "middle_school_mathematics": 0.20833333333333334, + "middle_school_biology": 0.19230769230769232, + "middle_school_physics": 0.16666666666666666, + "middle_school_chemistry": 0.12, + "veterinary_medicine": 0.17857142857142858, + "college_economics": 0.21666666666666667, + "business_administration": 0.3684210526315789, + "marxism": 0.20833333333333334, + "mao_zedong_thought": 0.20689655172413793, + "education_science": 0.4117647058823529, + "teacher_qualification": 0.2653061224489796, + "high_school_politics": 0.2916666666666667, + "high_school_geography": 0.16666666666666666, + "middle_school_politics": 0.34615384615384615, + "middle_school_geography": 0.11764705882352941, + "modern_chinese_history": 0.25, + "ideological_and_moral_cultivation": 0.20833333333333334, + "logic": 0.14814814814814814, + "law": 0.1724137931034483, + "chinese_language_and_literature": 0.25, + "art_studies": 0.3684210526315789, + "professional_tour_guide": 0.3235294117647059, + "legal_professional": 0.10714285714285714, + "high_school_chinese": 0.16666666666666666, + "high_school_history": 0.4, + "middle_school_history": 0.37037037037037035, + "civil_servant": 0.21153846153846154, + "sports_science": 0.16666666666666666, + "plant_protection": 0.37037037037037035, + "basic_medicine": 0.375, + "clinical_medicine": 0.25925925925925924, + "urban_and_rural_planner": 0.3137254901960784, + "accountant": 0.18518518518518517, + "fire_engineer": 0.2777777777777778, + "environmental_impact_assessment_engineer": 0.2222222222222222, + "tax_accountant": 0.2222222222222222, + "physician": 0.2777777777777778 + } + } }, - "qqp": { + "cmmlu": { "prompt_1": { - "accuracy": 0.501 + "accuracy": 0.2903225806451613 }, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 - }, - "mnli": { - "prompt_1": { - "accuracy": 0.3285 + "prompt_2": { + "accuracy": 0.2903225806451613 }, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 - }, - "qnli": { - "prompt_1": { - "accuracy": 0.5095 + "prompt_3": { + "accuracy": 0.31899641577060933 }, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 - }, - "wnli": { - "prompt_1": { - "accuracy": 0.5211267605633803 + "prompt_4": { + "accuracy": 0.26523297491039427 }, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_5": { + "accuracy": 0.2724014336917563 + } }, - "rte": { + "cmmlu_full": { "prompt_1": { - "accuracy": 0.4981949458483754 + "accuracy": 0.2562597133482991, + "category_acc": { + "agronomy": 0.23076923076923078, + "anatomy": 0.27702702702702703, + "ancient_chinese": 0.25, + "arts": 0.25625, + "astronomy": 0.2545454545454545, + "business_ethics": 0.2535885167464115, + "chinese_civil_service_exam": 0.24375, + "chinese_driving_rule": 0.2824427480916031, + "chinese_food_culture": 0.25735294117647056, + "chinese_foreign_policy": 0.2523364485981308, + "chinese_history": 0.25696594427244585, + "chinese_literature": 0.2647058823529412, + "chinese_teacher_qualification": 0.2681564245810056, + "clinical_knowledge": 0.2489451476793249, + "college_actuarial_science": 0.24528301886792453, + "college_education": 0.29906542056074764, + "college_engineering_hydrology": 0.22641509433962265, + "college_law": 0.24074074074074073, + "college_mathematics": 0.20952380952380953, + "college_medical_statistics": 0.25471698113207547, + "college_medicine": 0.25274725274725274, + "computer_science": 0.2696078431372549, + "computer_security": 0.23391812865497075, + "conceptual_physics": 0.2653061224489796, + "construction_project_management": 0.2589928057553957, + "economics": 0.25157232704402516, + "education": 0.25766871165644173, + "electrical_engineering": 0.2616279069767442, + "elementary_chinese": 0.2777777777777778, + "elementary_commonsense": 0.23737373737373738, + "elementary_information_and_technology": 0.27310924369747897, + "elementary_mathematics": 0.28695652173913044, + "ethnology": 0.26666666666666666, + "food_science": 0.27972027972027974, + "genetics": 0.2556818181818182, + "global_facts": 0.2483221476510067, + "high_school_biology": 0.28402366863905326, + "high_school_chemistry": 0.2803030303030303, + "high_school_geography": 0.2457627118644068, + "high_school_mathematics": 0.27439024390243905, + "high_school_physics": 0.23636363636363636, + "high_school_politics": 0.23776223776223776, + "human_sexuality": 0.23809523809523808, + "international_law": 0.22702702702702704, + "journalism": 0.23837209302325582, + "jurisprudence": 0.25790754257907544, + "legal_and_moral_basis": 0.2850467289719626, + "logical": 0.2845528455284553, + "machine_learning": 0.22950819672131148, + "management": 0.2571428571428571, + "marketing": 0.2111111111111111, + "marxist_theory": 0.25396825396825395, + "modern_chinese": 0.21551724137931033, + "nutrition": 0.27586206896551724, + "philosophy": 0.24761904761904763, + "professional_accounting": 0.26285714285714284, + "professional_law": 0.25118483412322273, + "professional_medicine": 0.2632978723404255, + "professional_psychology": 0.27155172413793105, + "public_relations": 0.28160919540229884, + "security_study": 0.2814814814814815, + "sociology": 0.23893805309734514, + "sports_science": 0.23030303030303031, + "traditional_chinese_medicine": 0.20540540540540542, + "virology": 0.27218934911242604, + "world_history": 0.21739130434782608, + "world_religions": 0.3 + } }, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 - }, - "mrpc": { - "prompt_1": { - "accuracy": 0.45588235294117646 + "prompt_2": { + "accuracy": 0.25979968917285445, + "category_acc": { + "agronomy": 0.25443786982248523, + "anatomy": 0.24324324324324326, + "ancient_chinese": 0.2621951219512195, + "arts": 0.28125, + "astronomy": 0.24848484848484848, + "business_ethics": 0.2822966507177033, + "chinese_civil_service_exam": 0.2125, + "chinese_driving_rule": 0.24427480916030533, + "chinese_food_culture": 0.23529411764705882, + "chinese_foreign_policy": 0.2897196261682243, + "chinese_history": 0.2631578947368421, + "chinese_literature": 0.28431372549019607, + "chinese_teacher_qualification": 0.2569832402234637, + "clinical_knowledge": 0.26582278481012656, + "college_actuarial_science": 0.24528301886792453, + "college_education": 0.2616822429906542, + "college_engineering_hydrology": 0.3018867924528302, + "college_law": 0.17592592592592593, + "college_mathematics": 0.22857142857142856, + "college_medical_statistics": 0.25471698113207547, + "college_medicine": 0.21611721611721613, + "computer_science": 0.2549019607843137, + "computer_security": 0.2222222222222222, + "conceptual_physics": 0.2857142857142857, + "construction_project_management": 0.26618705035971224, + "economics": 0.2830188679245283, + "education": 0.26993865030674846, + "electrical_engineering": 0.2616279069767442, + "elementary_chinese": 0.2698412698412698, + "elementary_commonsense": 0.23737373737373738, + "elementary_information_and_technology": 0.2689075630252101, + "elementary_mathematics": 0.2782608695652174, + "ethnology": 0.26666666666666666, + "food_science": 0.2517482517482518, + "genetics": 0.2840909090909091, + "global_facts": 0.2550335570469799, + "high_school_biology": 0.27218934911242604, + "high_school_chemistry": 0.2878787878787879, + "high_school_geography": 0.2457627118644068, + "high_school_mathematics": 0.25, + "high_school_physics": 0.24545454545454545, + "high_school_politics": 0.24475524475524477, + "human_sexuality": 0.2698412698412698, + "international_law": 0.22702702702702704, + "journalism": 0.2616279069767442, + "jurisprudence": 0.25790754257907544, + "legal_and_moral_basis": 0.2850467289719626, + "logical": 0.23577235772357724, + "machine_learning": 0.2459016393442623, + "management": 0.24761904761904763, + "marketing": 0.25555555555555554, + "marxist_theory": 0.24338624338624337, + "modern_chinese": 0.25862068965517243, + "nutrition": 0.2689655172413793, + "philosophy": 0.2571428571428571, + "professional_accounting": 0.26285714285714284, + "professional_law": 0.2796208530805687, + "professional_medicine": 0.26861702127659576, + "professional_psychology": 0.2801724137931034, + "public_relations": 0.2988505747126437, + "security_study": 0.28888888888888886, + "sociology": 0.25663716814159293, + "sports_science": 0.2727272727272727, + "traditional_chinese_medicine": 0.23243243243243245, + "virology": 0.3076923076923077, + "world_history": 0.2422360248447205, + "world_religions": 0.2375 + } }, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 - } - }, - "five_shot": { - "cross_mmlu": { - "prompt_1": -1 - }, - "cross_logiqa": { - "prompt_1": -1 - }, - "sg_eval": { - "prompt_1": -1 - }, - "cn_eval": { - "prompt_1": -1 - }, - "us_eval": { - "prompt_1": -1 - }, - "ph_eval": { - "prompt_1": -1 + "prompt_3": { + "accuracy": 0.25720946295976516, + "category_acc": { + "agronomy": 0.22485207100591717, + "anatomy": 0.2702702702702703, + "ancient_chinese": 0.23780487804878048, + "arts": 0.25, + "astronomy": 0.23030303030303031, + "business_ethics": 0.27751196172248804, + "chinese_civil_service_exam": 0.21875, + "chinese_driving_rule": 0.25190839694656486, + "chinese_food_culture": 0.22794117647058823, + "chinese_foreign_policy": 0.27102803738317754, + "chinese_history": 0.2693498452012384, + "chinese_literature": 0.27450980392156865, + "chinese_teacher_qualification": 0.2737430167597765, + "clinical_knowledge": 0.2489451476793249, + "college_actuarial_science": 0.25471698113207547, + "college_education": 0.308411214953271, + "college_engineering_hydrology": 0.29245283018867924, + "college_law": 0.19444444444444445, + "college_mathematics": 0.23809523809523808, + "college_medical_statistics": 0.25471698113207547, + "college_medicine": 0.2271062271062271, + "computer_science": 0.2696078431372549, + "computer_security": 0.23976608187134502, + "conceptual_physics": 0.2789115646258503, + "construction_project_management": 0.23741007194244604, + "economics": 0.25157232704402516, + "education": 0.25766871165644173, + "electrical_engineering": 0.2558139534883721, + "elementary_chinese": 0.2698412698412698, + "elementary_commonsense": 0.25757575757575757, + "elementary_information_and_technology": 0.2773109243697479, + "elementary_mathematics": 0.2565217391304348, + "ethnology": 0.2518518518518518, + "food_science": 0.3006993006993007, + "genetics": 0.25, + "global_facts": 0.2684563758389262, + "high_school_biology": 0.24260355029585798, + "high_school_chemistry": 0.2803030303030303, + "high_school_geography": 0.22033898305084745, + "high_school_mathematics": 0.2804878048780488, + "high_school_physics": 0.2545454545454545, + "high_school_politics": 0.23776223776223776, + "human_sexuality": 0.2777777777777778, + "international_law": 0.2648648648648649, + "journalism": 0.2441860465116279, + "jurisprudence": 0.26277372262773724, + "legal_and_moral_basis": 0.2803738317757009, + "logical": 0.23577235772357724, + "machine_learning": 0.27049180327868855, + "management": 0.24761904761904763, + "marketing": 0.23333333333333334, + "marxist_theory": 0.2222222222222222, + "modern_chinese": 0.22413793103448276, + "nutrition": 0.2620689655172414, + "philosophy": 0.2571428571428571, + "professional_accounting": 0.26285714285714284, + "professional_law": 0.2559241706161137, + "professional_medicine": 0.26063829787234044, + "professional_psychology": 0.2974137931034483, + "public_relations": 0.26436781609195403, + "security_study": 0.28888888888888886, + "sociology": 0.25663716814159293, + "sports_science": 0.23636363636363636, + "traditional_chinese_medicine": 0.24324324324324326, + "virology": 0.30177514792899407, + "world_history": 0.2546583850931677, + "world_religions": 0.24375 + } + }, + "prompt_4": { + "accuracy": 0.25807287169746157, + "category_acc": { + "agronomy": 0.22485207100591717, + "anatomy": 0.24324324324324326, + "ancient_chinese": 0.2682926829268293, + "arts": 0.30625, + "astronomy": 0.3090909090909091, + "business_ethics": 0.2727272727272727, + "chinese_civil_service_exam": 0.24375, + "chinese_driving_rule": 0.2366412213740458, + "chinese_food_culture": 0.27941176470588236, + "chinese_foreign_policy": 0.27102803738317754, + "chinese_history": 0.2693498452012384, + "chinese_literature": 0.25, + "chinese_teacher_qualification": 0.24581005586592178, + "clinical_knowledge": 0.2109704641350211, + "college_actuarial_science": 0.24528301886792453, + "college_education": 0.3177570093457944, + "college_engineering_hydrology": 0.1792452830188679, + "college_law": 0.28703703703703703, + "college_mathematics": 0.17142857142857143, + "college_medical_statistics": 0.2358490566037736, + "college_medicine": 0.23809523809523808, + "computer_science": 0.23039215686274508, + "computer_security": 0.22807017543859648, + "conceptual_physics": 0.2857142857142857, + "construction_project_management": 0.20863309352517986, + "economics": 0.2389937106918239, + "education": 0.27607361963190186, + "electrical_engineering": 0.23837209302325582, + "elementary_chinese": 0.24603174603174602, + "elementary_commonsense": 0.2828282828282828, + "elementary_information_and_technology": 0.27310924369747897, + "elementary_mathematics": 0.26956521739130435, + "ethnology": 0.23703703703703705, + "food_science": 0.2937062937062937, + "genetics": 0.2784090909090909, + "global_facts": 0.2684563758389262, + "high_school_biology": 0.21893491124260356, + "high_school_chemistry": 0.2803030303030303, + "high_school_geography": 0.2542372881355932, + "high_school_mathematics": 0.25609756097560976, + "high_school_physics": 0.2545454545454545, + "high_school_politics": 0.2517482517482518, + "human_sexuality": 0.23015873015873015, + "international_law": 0.25405405405405407, + "journalism": 0.2616279069767442, + "jurisprudence": 0.2895377128953771, + "legal_and_moral_basis": 0.2897196261682243, + "logical": 0.24390243902439024, + "machine_learning": 0.29508196721311475, + "management": 0.26666666666666666, + "marketing": 0.23333333333333334, + "marxist_theory": 0.291005291005291, + "modern_chinese": 0.21551724137931033, + "nutrition": 0.21379310344827587, + "philosophy": 0.3047619047619048, + "professional_accounting": 0.32571428571428573, + "professional_law": 0.24644549763033174, + "professional_medicine": 0.2765957446808511, + "professional_psychology": 0.28448275862068967, + "public_relations": 0.28735632183908044, + "security_study": 0.2518518518518518, + "sociology": 0.22123893805309736, + "sports_science": 0.24848484848484848, + "traditional_chinese_medicine": 0.23243243243243245, + "virology": 0.22485207100591717, + "world_history": 0.22981366459627328, + "world_religions": 0.29375 + } + }, + "prompt_5": { + "accuracy": 0.2542738732515973, + "category_acc": { + "agronomy": 0.24260355029585798, + "anatomy": 0.23648648648648649, + "ancient_chinese": 0.25, + "arts": 0.25625, + "astronomy": 0.2545454545454545, + "business_ethics": 0.22488038277511962, + "chinese_civil_service_exam": 0.2625, + "chinese_driving_rule": 0.2748091603053435, + "chinese_food_culture": 0.25735294117647056, + "chinese_foreign_policy": 0.2523364485981308, + "chinese_history": 0.24458204334365324, + "chinese_literature": 0.27450980392156865, + "chinese_teacher_qualification": 0.25139664804469275, + "clinical_knowledge": 0.24472573839662448, + "college_actuarial_science": 0.2358490566037736, + "college_education": 0.24299065420560748, + "college_engineering_hydrology": 0.2358490566037736, + "college_law": 0.2222222222222222, + "college_mathematics": 0.21904761904761905, + "college_medical_statistics": 0.2358490566037736, + "college_medicine": 0.23443223443223443, + "computer_science": 0.25980392156862747, + "computer_security": 0.2222222222222222, + "conceptual_physics": 0.25170068027210885, + "construction_project_management": 0.302158273381295, + "economics": 0.24528301886792453, + "education": 0.26993865030674846, + "electrical_engineering": 0.27906976744186046, + "elementary_chinese": 0.27380952380952384, + "elementary_commonsense": 0.30303030303030304, + "elementary_information_and_technology": 0.226890756302521, + "elementary_mathematics": 0.2782608695652174, + "ethnology": 0.22962962962962963, + "food_science": 0.26573426573426573, + "genetics": 0.26704545454545453, + "global_facts": 0.2550335570469799, + "high_school_biology": 0.24260355029585798, + "high_school_chemistry": 0.2803030303030303, + "high_school_geography": 0.2627118644067797, + "high_school_mathematics": 0.25609756097560976, + "high_school_physics": 0.23636363636363636, + "high_school_politics": 0.25874125874125875, + "human_sexuality": 0.23809523809523808, + "international_law": 0.24864864864864866, + "journalism": 0.2441860465116279, + "jurisprudence": 0.25060827250608275, + "legal_and_moral_basis": 0.27102803738317754, + "logical": 0.24390243902439024, + "machine_learning": 0.27049180327868855, + "management": 0.23809523809523808, + "marketing": 0.23333333333333334, + "marxist_theory": 0.23809523809523808, + "modern_chinese": 0.25, + "nutrition": 0.27586206896551724, + "philosophy": 0.2571428571428571, + "professional_accounting": 0.2571428571428571, + "professional_law": 0.27488151658767773, + "professional_medicine": 0.2526595744680851, + "professional_psychology": 0.3017241379310345, + "public_relations": 0.3045977011494253, + "security_study": 0.2962962962962963, + "sociology": 0.252212389380531, + "sports_science": 0.22424242424242424, + "traditional_chinese_medicine": 0.21081081081081082, + "virology": 0.24260355029585798, + "world_history": 0.22981366459627328, + "world_religions": 0.28125 + } + } }, - "sing2eng": { - "prompt_1": -1 + "zbench": { + "prompt_1": { + "accuracy": 0.2727272727272727 + }, + "prompt_2": { + "accuracy": 0.24242424242424243 + }, + "prompt_3": { + "accuracy": 0.21212121212121213 + }, + "prompt_4": { + "accuracy": 0.15151515151515152 + }, + "prompt_5": { + "accuracy": 0.18181818181818182 + } }, - "flores_ind2eng": { - "prompt_1": -1 + "ind_emotion": { + "prompt_1": { + "accuracy": 0.17272727272727273 + }, + "prompt_2": { + "accuracy": 0.11818181818181818 + }, + "prompt_3": { + "accuracy": 0.11136363636363636 + }, + "prompt_4": { + "accuracy": 0.14545454545454545 + }, + "prompt_5": { + "accuracy": 0.16363636363636364 + } + }, + "ocnli": { + "prompt_1": { + "accuracy": 0.3227118644067797 + }, + "prompt_2": { + "accuracy": 0.3264406779661017 + }, + "prompt_3": { + "accuracy": 0.3325423728813559 + }, + "prompt_4": { + "accuracy": 0.3345762711864407 + }, + "prompt_5": { + "accuracy": 0.31966101694915255 + } + }, + "c3": { + "prompt_1": { + "accuracy": 0.281226626776365 + }, + "prompt_2": { + "accuracy": 0.2737471952131638 + }, + "prompt_3": { + "accuracy": 0.2763649962602842 + }, + "prompt_4": { + "accuracy": 0.27599102468212416 + }, + "prompt_5": { + "accuracy": 0.2894540014958863 + } + }, + "dream": { + "prompt_1": { + "accuracy": 0.3258206761391475 + }, + "prompt_2": { + "accuracy": 0.32533072023517884 + }, + "prompt_3": { + "accuracy": 0.32435080842724157 + }, + "prompt_4": { + "accuracy": 0.3375796178343949 + }, + "prompt_5": { + "accuracy": 0.33219010289073986 + } + }, + "samsum": { + "prompt_1": { + "rouge1": 0.21071721731902832, + "rouge2": 0.06263208088632324, + "rougeL": 0.15873185260050474, + "avg_rouge": 0.14402705026861876 + }, + "prompt_2": { + "rouge1": 0.17477087124858154, + "rouge2": 0.05161058151166577, + "rougeL": 0.13780980003856205, + "avg_rouge": 0.12139708426626979 + }, + "prompt_3": { + "rouge1": 0.17836046630353639, + "rouge2": 0.05234500957536136, + "rougeL": 0.1420107915086774, + "avg_rouge": 0.12423875579585837 + }, + "prompt_4": { + "rouge1": 0.20612700441253587, + "rouge2": 0.06196281775998205, + "rougeL": 0.157844152573879, + "avg_rouge": 0.1419779915821323 + }, + "prompt_5": { + "rouge1": 0.19232902309680353, + "rouge2": 0.056889868658432464, + "rougeL": 0.1471035576604194, + "avg_rouge": 0.1321074831385518 + } + }, + "dialogsum": { + "prompt_1": { + "rouge1": 0.2095880648481376, + "rouge2": 0.05886625847491587, + "rougeL": 0.15351111164357276, + "avg_rouge": 0.1406551449888754 + }, + "prompt_2": { + "rouge1": 0.21071351003968464, + "rouge2": 0.05844581658785653, + "rougeL": 0.15474814840633067, + "avg_rouge": 0.14130249167795728 + }, + "prompt_3": { + "rouge1": 0.20990935287930465, + "rouge2": 0.05867884234976881, + "rougeL": 0.15469198105008658, + "avg_rouge": 0.14109339209305335 + }, + "prompt_4": { + "rouge1": 0.20494868093526838, + "rouge2": 0.058064620767554594, + "rougeL": 0.14976318600375157, + "avg_rouge": 0.1375921625688582 + }, + "prompt_5": { + "rouge1": 0.20954452685307218, + "rouge2": 0.057658269430444185, + "rougeL": 0.15468782985154383, + "avg_rouge": 0.14063020871168672 + } + }, + "sst2": { + "prompt_1": { + "accuracy": 0.5034403669724771 + }, + "prompt_2": { + "accuracy": 0.4805045871559633 + }, + "prompt_3": { + "accuracy": 0.4988532110091743 + }, + "prompt_4": { + "accuracy": 0.5011467889908257 + }, + "prompt_5": { + "accuracy": 0.5194954128440367 + } + }, + "cola": { + "prompt_1": { + "accuracy": 0.5043144774688398 + }, + "prompt_2": { + "accuracy": 0.5349952061361457 + }, + "prompt_3": { + "accuracy": 0.5129434324065196 + }, + "prompt_4": { + "accuracy": 0.5043144774688398 + }, + "prompt_5": { + "accuracy": 0.5043144774688398 + } + }, + "qqp": { + "prompt_1": { + "accuracy": 0.501 + }, + "prompt_2": { + "accuracy": 0.5135 + }, + "prompt_3": { + "accuracy": 0.54 + }, + "prompt_4": { + "accuracy": 0.497 + }, + "prompt_5": { + "accuracy": 0.5055 + } + }, + "mnli": { + "prompt_1": { + "accuracy": 0.3285 + }, + "prompt_2": { + "accuracy": 0.3205 + }, + "prompt_3": { + "accuracy": 0.32 + }, + "prompt_4": { + "accuracy": 0.336 + }, + "prompt_5": { + "accuracy": 0.3205 + } + }, + "qnli": { + "prompt_1": { + "accuracy": 0.5095 + }, + "prompt_2": { + "accuracy": 0.5 + }, + "prompt_3": { + "accuracy": 0.4975 + }, + "prompt_4": { + "accuracy": 0.4885 + }, + "prompt_5": { + "accuracy": 0.492 + } + }, + "wnli": { + "prompt_1": { + "accuracy": 0.5211267605633803 + }, + "prompt_2": { + "accuracy": 0.5352112676056338 + }, + "prompt_3": { + "accuracy": 0.5070422535211268 + }, + "prompt_4": { + "accuracy": 0.5211267605633803 + }, + "prompt_5": { + "accuracy": 0.5211267605633803 + } + }, + "rte": { + "prompt_1": { + "accuracy": 0.4981949458483754 + }, + "prompt_2": { + "accuracy": 0.48014440433212996 + }, + "prompt_3": { + "accuracy": 0.49097472924187724 + }, + "prompt_4": { + "accuracy": 0.49097472924187724 + }, + "prompt_5": { + "accuracy": 0.48375451263537905 + } + }, + "mrpc": { + "prompt_1": { + "accuracy": 0.45588235294117646 + }, + "prompt_2": { + "accuracy": 0.4387254901960784 + }, + "prompt_3": { + "accuracy": 0.47058823529411764 + }, + "prompt_4": { + "accuracy": 0.3480392156862745 + }, + "prompt_5": { + "accuracy": 0.46568627450980393 + } + } + }, + "five_shot": { + "cross_mmlu": { + "prompt_1": -1 + }, + "cross_logiqa": { + "prompt_1": -1 + }, + "sg_eval": { + "prompt_1": -1 + }, + "cn_eval": { + "prompt_1": -1 + }, + "us_eval": { + "prompt_1": -1 + }, + "ph_eval": { + "prompt_1": -1 + }, + "sing2eng": { + "prompt_1": -1 + }, + "flores_ind2eng": { + "prompt_1": -1 }, "flores_vie2eng": { "prompt_1": -1 @@ -100284,32 +102218,346 @@ "AC3_6": 0.05525902666634465, "AC3_7": 0.0367816091806051 }, - "prompt_4": -1, - "prompt_5": -1 - }, - "cross_logiqa": { - "prompt_1": { - "overall_acc": 0.2759740259740259, + "prompt_4": { + "overall_acc": 0.24666666666666665, "language_acc": { - "Vietnamese": 0.2727272727272727, - "Indonesian": 0.30113636363636365, - "Malay": 0.2840909090909091, - "English": 0.23863636363636365, - "Spanish": 0.2784090909090909, - "Filipino": 0.2784090909090909, - "Chinese": 0.2784090909090909 - }, - "consistency_score_2": 0.41260822510822515, - "consistency_score_3": 0.20974025974025978, - "consistency_score_4": 0.11931818181818184, - "consistency_score_5": 0.0725108225108225, - "consistency_score_6": 0.045454545454545456, - "consistency_score_7": 0.028409090909090908, + "English": 0.28, + "Vietnamese": 0.22, + "Malay": 0.24, + "Indonesian": 0.24, + "Spanish": 0.25333333333333335, + "Chinese": 0.22666666666666666, + "Filipino": 0.26666666666666666 + }, + "consistency_score_2": 0.5425396825396825, + "consistency_score_3": 0.35790476190476195, + "consistency_score_4": 0.25695238095238093, + "consistency_score_5": 0.19460317460317458, + "consistency_score_6": 0.1542857142857143, + "consistency_score_7": 0.12666666666666668, "detailed_consistency_score": { "2_combine": { - "Vietnamese,Indonesian": 0.5738636363636364, - "Vietnamese,Malay": 0.4318181818181818, - "Vietnamese,English": 0.5681818181818182, + "English,Vietnamese": 0.5133333333333333, + "English,Malay": 0.49333333333333335, + "English,Indonesian": 0.5066666666666667, + "English,Spanish": 0.48, + "English,Chinese": 0.42, + "English,Filipino": 0.49333333333333335, + "Vietnamese,Malay": 0.6666666666666666, + "Vietnamese,Indonesian": 0.6266666666666667, + "Vietnamese,Spanish": 0.72, + "Vietnamese,Chinese": 0.4866666666666667, + "Vietnamese,Filipino": 0.5266666666666666, + "Malay,Indonesian": 0.72, + "Malay,Spanish": 0.6, + "Malay,Chinese": 0.5066666666666667, + "Malay,Filipino": 0.5933333333333334, + "Indonesian,Spanish": 0.48, + "Indonesian,Chinese": 0.5066666666666667, + "Indonesian,Filipino": 0.5266666666666666, + "Spanish,Chinese": 0.4066666666666667, + "Spanish,Filipino": 0.5266666666666666, + "Chinese,Filipino": 0.5933333333333334 + }, + "3_combine": { + "English,Vietnamese,Malay": 0.36666666666666664, + "English,Vietnamese,Indonesian": 0.36666666666666664, + "English,Vietnamese,Spanish": 0.4, + "English,Vietnamese,Chinese": 0.26, + "English,Vietnamese,Filipino": 0.32, + "English,Malay,Indonesian": 0.38666666666666666, + "English,Malay,Spanish": 0.34, + "English,Malay,Chinese": 0.24666666666666667, + "English,Malay,Filipino": 0.34, + "English,Indonesian,Spanish": 0.29333333333333333, + "English,Indonesian,Chinese": 0.2733333333333333, + "English,Indonesian,Filipino": 0.31333333333333335, + "English,Spanish,Chinese": 0.23333333333333334, + "English,Spanish,Filipino": 0.31333333333333335, + "English,Chinese,Filipino": 0.30666666666666664, + "Vietnamese,Malay,Indonesian": 0.52, + "Vietnamese,Malay,Spanish": 0.5133333333333333, + "Vietnamese,Malay,Chinese": 0.35333333333333333, + "Vietnamese,Malay,Filipino": 0.42, + "Vietnamese,Indonesian,Spanish": 0.44, + "Vietnamese,Indonesian,Chinese": 0.3466666666666667, + "Vietnamese,Indonesian,Filipino": 0.38, + "Vietnamese,Spanish,Chinese": 0.35333333333333333, + "Vietnamese,Spanish,Filipino": 0.4, + "Vietnamese,Chinese,Filipino": 0.37333333333333335, + "Malay,Indonesian,Spanish": 0.43333333333333335, + "Malay,Indonesian,Chinese": 0.4, + "Malay,Indonesian,Filipino": 0.44666666666666666, + "Malay,Spanish,Chinese": 0.3, + "Malay,Spanish,Filipino": 0.4, + "Malay,Chinese,Filipino": 0.4066666666666667, + "Indonesian,Spanish,Chinese": 0.26666666666666666, + "Indonesian,Spanish,Filipino": 0.31333333333333335, + "Indonesian,Chinese,Filipino": 0.38, + "Spanish,Chinese,Filipino": 0.32 + }, + "4_combine": { + "English,Vietnamese,Malay,Indonesian": 0.3, + "English,Vietnamese,Malay,Spanish": 0.3, + "English,Vietnamese,Malay,Chinese": 0.19333333333333333, + "English,Vietnamese,Malay,Filipino": 0.24666666666666667, + "English,Vietnamese,Indonesian,Spanish": 0.28, + "English,Vietnamese,Indonesian,Chinese": 0.20666666666666667, + "English,Vietnamese,Indonesian,Filipino": 0.24, + "English,Vietnamese,Spanish,Chinese": 0.22666666666666666, + "English,Vietnamese,Spanish,Filipino": 0.25333333333333335, + "English,Vietnamese,Chinese,Filipino": 0.21333333333333335, + "English,Malay,Indonesian,Spanish": 0.24666666666666667, + "English,Malay,Indonesian,Chinese": 0.21333333333333335, + "English,Malay,Indonesian,Filipino": 0.25333333333333335, + "English,Malay,Spanish,Chinese": 0.18, + "English,Malay,Spanish,Filipino": 0.24666666666666667, + "English,Malay,Chinese,Filipino": 0.21333333333333335, + "English,Indonesian,Spanish,Chinese": 0.18, + "English,Indonesian,Spanish,Filipino": 0.19333333333333333, + "English,Indonesian,Chinese,Filipino": 0.22, + "English,Spanish,Chinese,Filipino": 0.18666666666666668, + "Vietnamese,Malay,Indonesian,Spanish": 0.3933333333333333, + "Vietnamese,Malay,Indonesian,Chinese": 0.28, + "Vietnamese,Malay,Indonesian,Filipino": 0.32, + "Vietnamese,Malay,Spanish,Chinese": 0.2866666666666667, + "Vietnamese,Malay,Spanish,Filipino": 0.3333333333333333, + "Vietnamese,Malay,Chinese,Filipino": 0.31333333333333335, + "Vietnamese,Indonesian,Spanish,Chinese": 0.26, + "Vietnamese,Indonesian,Spanish,Filipino": 0.28, + "Vietnamese,Indonesian,Chinese,Filipino": 0.29333333333333333, + "Vietnamese,Spanish,Chinese,Filipino": 0.2866666666666667, + "Malay,Indonesian,Spanish,Chinese": 0.23333333333333334, + "Malay,Indonesian,Spanish,Filipino": 0.2866666666666667, + "Malay,Indonesian,Chinese,Filipino": 0.32666666666666666, + "Malay,Spanish,Chinese,Filipino": 0.2733333333333333, + "Indonesian,Spanish,Chinese,Filipino": 0.23333333333333334 + }, + "5_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish": 0.23333333333333334, + "English,Vietnamese,Malay,Indonesian,Chinese": 0.16, + "English,Vietnamese,Malay,Indonesian,Filipino": 0.19333333333333333, + "English,Vietnamese,Malay,Spanish,Chinese": 0.18, + "English,Vietnamese,Malay,Spanish,Filipino": 0.20666666666666667, + "English,Vietnamese,Malay,Chinese,Filipino": 0.17333333333333334, + "English,Vietnamese,Indonesian,Spanish,Chinese": 0.18, + "English,Vietnamese,Indonesian,Spanish,Filipino": 0.18, + "English,Vietnamese,Indonesian,Chinese,Filipino": 0.17333333333333334, + "English,Vietnamese,Spanish,Chinese,Filipino": 0.18, + "English,Malay,Indonesian,Spanish,Chinese": 0.14666666666666667, + "English,Malay,Indonesian,Spanish,Filipino": 0.16666666666666666, + "English,Malay,Indonesian,Chinese,Filipino": 0.18, + "English,Malay,Spanish,Chinese,Filipino": 0.16, + "English,Indonesian,Spanish,Chinese,Filipino": 0.14666666666666667, + "Vietnamese,Malay,Indonesian,Spanish,Chinese": 0.22666666666666666, + "Vietnamese,Malay,Indonesian,Spanish,Filipino": 0.25333333333333335, + "Vietnamese,Malay,Indonesian,Chinese,Filipino": 0.24666666666666667, + "Vietnamese,Malay,Spanish,Chinese,Filipino": 0.26, + "Vietnamese,Indonesian,Spanish,Chinese,Filipino": 0.22666666666666666, + "Malay,Indonesian,Spanish,Chinese,Filipino": 0.21333333333333335 + }, + "6_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish,Chinese": 0.14666666666666667, + "English,Vietnamese,Malay,Indonesian,Spanish,Filipino": 0.15333333333333332, + "English,Vietnamese,Malay,Indonesian,Chinese,Filipino": 0.14, + "English,Vietnamese,Malay,Spanish,Chinese,Filipino": 0.16, + "English,Vietnamese,Indonesian,Spanish,Chinese,Filipino": 0.14666666666666667, + "English,Malay,Indonesian,Spanish,Chinese,Filipino": 0.12666666666666668, + "Vietnamese,Malay,Indonesian,Spanish,Chinese,Filipino": 0.20666666666666667 + }, + "7_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish,Chinese,Filipino": 0.12666666666666668 + } + }, + "AC3_2": 0.3391418610457912, + "AC3_3": 0.2920520898487732, + "AC3_4": 0.2517044880992896, + "AC3_5": 0.21756354911136747, + "AC3_6": 0.1898337291688063, + "AC3_7": 0.1673809523361182 + }, + "prompt_5": { + "overall_acc": 0.2314285714285714, + "language_acc": { + "English": 0.2733333333333333, + "Vietnamese": 0.22, + "Malay": 0.22, + "Indonesian": 0.22666666666666666, + "Spanish": 0.22, + "Chinese": 0.24, + "Filipino": 0.22 + }, + "consistency_score_2": 0.4095238095238095, + "consistency_score_3": 0.211047619047619, + "consistency_score_4": 0.11885714285714286, + "consistency_score_5": 0.06888888888888889, + "consistency_score_6": 0.039047619047619046, + "consistency_score_7": 0.02, + "detailed_consistency_score": { + "2_combine": { + "English,Vietnamese": 0.49333333333333335, + "English,Malay": 0.31333333333333335, + "English,Indonesian": 0.3333333333333333, + "English,Spanish": 0.5, + "English,Chinese": 0.26666666666666666, + "English,Filipino": 0.36666666666666664, + "Vietnamese,Malay": 0.5266666666666666, + "Vietnamese,Indonesian": 0.46, + "Vietnamese,Spanish": 0.6, + "Vietnamese,Chinese": 0.37333333333333335, + "Vietnamese,Filipino": 0.3466666666666667, + "Malay,Indonesian": 0.5333333333333333, + "Malay,Spanish": 0.4533333333333333, + "Malay,Chinese": 0.38666666666666666, + "Malay,Filipino": 0.4, + "Indonesian,Spanish": 0.37333333333333335, + "Indonesian,Chinese": 0.4066666666666667, + "Indonesian,Filipino": 0.44666666666666666, + "Spanish,Chinese": 0.31333333333333335, + "Spanish,Filipino": 0.3333333333333333, + "Chinese,Filipino": 0.37333333333333335 + }, + "3_combine": { + "English,Vietnamese,Malay": 0.21333333333333335, + "English,Vietnamese,Indonesian": 0.21333333333333335, + "English,Vietnamese,Spanish": 0.3466666666666667, + "English,Vietnamese,Chinese": 0.18666666666666668, + "English,Vietnamese,Filipino": 0.20666666666666667, + "English,Malay,Indonesian": 0.16666666666666666, + "English,Malay,Spanish": 0.2, + "English,Malay,Chinese": 0.09333333333333334, + "English,Malay,Filipino": 0.16666666666666666, + "English,Indonesian,Spanish": 0.18666666666666668, + "English,Indonesian,Chinese": 0.14, + "English,Indonesian,Filipino": 0.18, + "English,Spanish,Chinese": 0.16, + "English,Spanish,Filipino": 0.20666666666666667, + "English,Chinese,Filipino": 0.12666666666666668, + "Vietnamese,Malay,Indonesian": 0.30666666666666664, + "Vietnamese,Malay,Spanish": 0.34, + "Vietnamese,Malay,Chinese": 0.22666666666666666, + "Vietnamese,Malay,Filipino": 0.22666666666666666, + "Vietnamese,Indonesian,Spanish": 0.28, + "Vietnamese,Indonesian,Chinese": 0.22666666666666666, + "Vietnamese,Indonesian,Filipino": 0.22, + "Vietnamese,Spanish,Chinese": 0.22666666666666666, + "Vietnamese,Spanish,Filipino": 0.23333333333333334, + "Vietnamese,Chinese,Filipino": 0.2, + "Malay,Indonesian,Spanish": 0.24, + "Malay,Indonesian,Chinese": 0.25333333333333335, + "Malay,Indonesian,Filipino": 0.26666666666666666, + "Malay,Spanish,Chinese": 0.17333333333333334, + "Malay,Spanish,Filipino": 0.18666666666666668, + "Malay,Chinese,Filipino": 0.21333333333333335, + "Indonesian,Spanish,Chinese": 0.18, + "Indonesian,Spanish,Filipino": 0.18, + "Indonesian,Chinese,Filipino": 0.24, + "Spanish,Chinese,Filipino": 0.17333333333333334 + }, + "4_combine": { + "English,Vietnamese,Malay,Indonesian": 0.12666666666666668, + "English,Vietnamese,Malay,Spanish": 0.14, + "English,Vietnamese,Malay,Chinese": 0.07333333333333333, + "English,Vietnamese,Malay,Filipino": 0.11333333333333333, + "English,Vietnamese,Indonesian,Spanish": 0.13333333333333333, + "English,Vietnamese,Indonesian,Chinese": 0.1, + "English,Vietnamese,Indonesian,Filipino": 0.11333333333333333, + "English,Vietnamese,Spanish,Chinese": 0.12, + "English,Vietnamese,Spanish,Filipino": 0.15333333333333332, + "English,Vietnamese,Chinese,Filipino": 0.1, + "English,Malay,Indonesian,Spanish": 0.09333333333333334, + "English,Malay,Indonesian,Chinese": 0.06, + "English,Malay,Indonesian,Filipino": 0.1, + "English,Malay,Spanish,Chinese": 0.06, + "English,Malay,Spanish,Filipino": 0.11333333333333333, + "English,Malay,Chinese,Filipino": 0.06666666666666667, + "English,Indonesian,Spanish,Chinese": 0.08666666666666667, + "English,Indonesian,Spanish,Filipino": 0.1, + "English,Indonesian,Chinese,Filipino": 0.08666666666666667, + "English,Spanish,Chinese,Filipino": 0.08666666666666667, + "Vietnamese,Malay,Indonesian,Spanish": 0.2, + "Vietnamese,Malay,Indonesian,Chinese": 0.15333333333333332, + "Vietnamese,Malay,Indonesian,Filipino": 0.16666666666666666, + "Vietnamese,Malay,Spanish,Chinese": 0.13333333333333333, + "Vietnamese,Malay,Spanish,Filipino": 0.14666666666666667, + "Vietnamese,Malay,Chinese,Filipino": 0.14666666666666667, + "Vietnamese,Indonesian,Spanish,Chinese": 0.14, + "Vietnamese,Indonesian,Spanish,Filipino": 0.14, + "Vietnamese,Indonesian,Chinese,Filipino": 0.15333333333333332, + "Vietnamese,Spanish,Chinese,Filipino": 0.13333333333333333, + "Malay,Indonesian,Spanish,Chinese": 0.11333333333333333, + "Malay,Indonesian,Spanish,Filipino": 0.11333333333333333, + "Malay,Indonesian,Chinese,Filipino": 0.16, + "Malay,Spanish,Chinese,Filipino": 0.11333333333333333, + "Indonesian,Spanish,Chinese,Filipino": 0.12 + }, + "5_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish": 0.06666666666666667, + "English,Vietnamese,Malay,Indonesian,Chinese": 0.04666666666666667, + "English,Vietnamese,Malay,Indonesian,Filipino": 0.07333333333333333, + "English,Vietnamese,Malay,Spanish,Chinese": 0.04, + "English,Vietnamese,Malay,Spanish,Filipino": 0.08, + "English,Vietnamese,Malay,Chinese,Filipino": 0.05333333333333334, + "English,Vietnamese,Indonesian,Spanish,Chinese": 0.06, + "English,Vietnamese,Indonesian,Spanish,Filipino": 0.07333333333333333, + "English,Vietnamese,Indonesian,Chinese,Filipino": 0.06666666666666667, + "English,Vietnamese,Spanish,Chinese,Filipino": 0.06666666666666667, + "English,Malay,Indonesian,Spanish,Chinese": 0.03333333333333333, + "English,Malay,Indonesian,Spanish,Filipino": 0.05333333333333334, + "English,Malay,Indonesian,Chinese,Filipino": 0.04666666666666667, + "English,Malay,Spanish,Chinese,Filipino": 0.04666666666666667, + "English,Indonesian,Spanish,Chinese,Filipino": 0.05333333333333334, + "Vietnamese,Malay,Indonesian,Spanish,Chinese": 0.09333333333333334, + "Vietnamese,Malay,Indonesian,Spanish,Filipino": 0.1, + "Vietnamese,Malay,Indonesian,Chinese,Filipino": 0.12, + "Vietnamese,Malay,Spanish,Chinese,Filipino": 0.09333333333333334, + "Vietnamese,Indonesian,Spanish,Chinese,Filipino": 0.1, + "Malay,Indonesian,Spanish,Chinese,Filipino": 0.08 + }, + "6_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish,Chinese": 0.02, + "English,Vietnamese,Malay,Indonesian,Spanish,Filipino": 0.04, + "English,Vietnamese,Malay,Indonesian,Chinese,Filipino": 0.04, + "English,Vietnamese,Malay,Spanish,Chinese,Filipino": 0.03333333333333333, + "English,Vietnamese,Indonesian,Spanish,Chinese,Filipino": 0.04, + "English,Malay,Indonesian,Spanish,Chinese,Filipino": 0.02666666666666667, + "Vietnamese,Malay,Indonesian,Spanish,Chinese,Filipino": 0.07333333333333333 + }, + "7_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish,Chinese,Filipino": 0.02 + } + }, + "AC3_2": 0.29573338989230225, + "AC3_3": 0.22076871036151985, + "AC3_4": 0.15705429965220424, + "AC3_5": 0.106173361486845, + "AC3_6": 0.066820925528615, + "AC3_7": 0.03681818180353822 + } + }, + "cross_logiqa": { + "prompt_1": { + "overall_acc": 0.2759740259740259, + "language_acc": { + "Vietnamese": 0.2727272727272727, + "Indonesian": 0.30113636363636365, + "Malay": 0.2840909090909091, + "English": 0.23863636363636365, + "Spanish": 0.2784090909090909, + "Filipino": 0.2784090909090909, + "Chinese": 0.2784090909090909 + }, + "consistency_score_2": 0.41260822510822515, + "consistency_score_3": 0.20974025974025978, + "consistency_score_4": 0.11931818181818184, + "consistency_score_5": 0.0725108225108225, + "consistency_score_6": 0.045454545454545456, + "consistency_score_7": 0.028409090909090908, + "detailed_consistency_score": { + "2_combine": { + "Vietnamese,Indonesian": 0.5738636363636364, + "Vietnamese,Malay": 0.4318181818181818, + "Vietnamese,English": 0.5681818181818182, "Vietnamese,Spanish": 0.4090909090909091, "Vietnamese,Filipino": 0.30113636363636365, "Vietnamese,Chinese": 0.4772727272727273, @@ -100762,21 +103010,339 @@ "AC3_6": 0.015728041422939577, "AC3_7": 0.0 }, - "prompt_4": -1, - "prompt_5": -1 - }, - "sg_eval": { - "prompt_1": { - "accuracy": 0.32038834951456313 + "prompt_4": { + "overall_acc": 0.26055194805194803, + "language_acc": { + "Vietnamese": 0.26704545454545453, + "Indonesian": 0.2784090909090909, + "Malay": 0.2840909090909091, + "English": 0.2215909090909091, + "Spanish": 0.2556818181818182, + "Filipino": 0.2556818181818182, + "Chinese": 0.26136363636363635 + }, + "consistency_score_2": 0.34983766233766234, + "consistency_score_3": 0.14545454545454548, + "consistency_score_4": 0.0732142857142857, + "consistency_score_5": 0.04626623376623377, + "consistency_score_6": 0.0349025974025974, + "consistency_score_7": 0.028409090909090908, + "detailed_consistency_score": { + "2_combine": { + "Vietnamese,Indonesian": 0.3522727272727273, + "Vietnamese,Malay": 0.4659090909090909, + "Vietnamese,English": 0.45454545454545453, + "Vietnamese,Spanish": 0.48295454545454547, + "Vietnamese,Filipino": 0.24431818181818182, + "Vietnamese,Chinese": 0.24431818181818182, + "Indonesian,Malay": 0.4090909090909091, + "Indonesian,English": 0.38636363636363635, + "Indonesian,Spanish": 0.29545454545454547, + "Indonesian,Filipino": 0.4147727272727273, + "Indonesian,Chinese": 0.4034090909090909, + "Malay,English": 0.38636363636363635, + "Malay,Spanish": 0.35795454545454547, + "Malay,Filipino": 0.3125, + "Malay,Chinese": 0.2556818181818182, + "English,Spanish": 0.4090909090909091, + "English,Filipino": 0.2727272727272727, + "English,Chinese": 0.3352272727272727, + "Spanish,Filipino": 0.2159090909090909, + "Spanish,Chinese": 0.25, + "Filipino,Chinese": 0.3977272727272727 + }, + "3_combine": { + "Vietnamese,Indonesian,Malay": 0.2159090909090909, + "Vietnamese,Indonesian,English": 0.17613636363636365, + "Vietnamese,Indonesian,Spanish": 0.18181818181818182, + "Vietnamese,Indonesian,Filipino": 0.13068181818181818, + "Vietnamese,Indonesian,Chinese": 0.11363636363636363, + "Vietnamese,Malay,English": 0.2215909090909091, + "Vietnamese,Malay,Spanish": 0.25, + "Vietnamese,Malay,Filipino": 0.125, + "Vietnamese,Malay,Chinese": 0.10227272727272728, + "Vietnamese,English,Spanish": 0.2727272727272727, + "Vietnamese,English,Filipino": 0.11363636363636363, + "Vietnamese,English,Chinese": 0.13636363636363635, + "Vietnamese,Spanish,Filipino": 0.10227272727272728, + "Vietnamese,Spanish,Chinese": 0.10795454545454546, + "Vietnamese,Filipino,Chinese": 0.10795454545454546, + "Indonesian,Malay,English": 0.17613636363636365, + "Indonesian,Malay,Spanish": 0.16477272727272727, + "Indonesian,Malay,Filipino": 0.17045454545454544, + "Indonesian,Malay,Chinese": 0.1534090909090909, + "Indonesian,English,Spanish": 0.1590909090909091, + "Indonesian,English,Filipino": 0.14772727272727273, + "Indonesian,English,Chinese": 0.1590909090909091, + "Indonesian,Spanish,Filipino": 0.10795454545454546, + "Indonesian,Spanish,Chinese": 0.10795454545454546, + "Indonesian,Filipino,Chinese": 0.19318181818181818, + "Malay,English,Spanish": 0.1875, + "Malay,English,Filipino": 0.13068181818181818, + "Malay,English,Chinese": 0.14772727272727273, + "Malay,Spanish,Filipino": 0.09090909090909091, + "Malay,Spanish,Chinese": 0.09659090909090909, + "Malay,Filipino,Chinese": 0.13068181818181818, + "English,Spanish,Filipino": 0.07386363636363637, + "English,Spanish,Chinese": 0.09659090909090909, + "English,Filipino,Chinese": 0.13068181818181818, + "Spanish,Filipino,Chinese": 0.10795454545454546 + }, + "4_combine": { + "Vietnamese,Indonesian,Malay,English": 0.11931818181818182, + "Vietnamese,Indonesian,Malay,Spanish": 0.125, + "Vietnamese,Indonesian,Malay,Filipino": 0.07954545454545454, + "Vietnamese,Indonesian,Malay,Chinese": 0.07386363636363637, + "Vietnamese,Indonesian,English,Spanish": 0.13068181818181818, + "Vietnamese,Indonesian,English,Filipino": 0.0625, + "Vietnamese,Indonesian,English,Chinese": 0.06818181818181818, + "Vietnamese,Indonesian,Spanish,Filipino": 0.0625, + "Vietnamese,Indonesian,Spanish,Chinese": 0.056818181818181816, + "Vietnamese,Indonesian,Filipino,Chinese": 0.07386363636363637, + "Vietnamese,Malay,English,Spanish": 0.1534090909090909, + "Vietnamese,Malay,English,Filipino": 0.08522727272727272, + "Vietnamese,Malay,English,Chinese": 0.06818181818181818, + "Vietnamese,Malay,Spanish,Filipino": 0.0625, + "Vietnamese,Malay,Spanish,Chinese": 0.0625, + "Vietnamese,Malay,Filipino,Chinese": 0.07386363636363637, + "Vietnamese,English,Spanish,Filipino": 0.0625, + "Vietnamese,English,Spanish,Chinese": 0.07954545454545454, + "Vietnamese,English,Filipino,Chinese": 0.05113636363636364, + "Vietnamese,Spanish,Filipino,Chinese": 0.03977272727272727, + "Indonesian,Malay,English,Spanish": 0.10227272727272728, + "Indonesian,Malay,English,Filipino": 0.06818181818181818, + "Indonesian,Malay,English,Chinese": 0.07386363636363637, + "Indonesian,Malay,Spanish,Filipino": 0.06818181818181818, + "Indonesian,Malay,Spanish,Chinese": 0.06818181818181818, + "Indonesian,Malay,Filipino,Chinese": 0.08522727272727272, + "Indonesian,English,Spanish,Filipino": 0.05113636363636364, + "Indonesian,English,Spanish,Chinese": 0.05113636363636364, + "Indonesian,English,Filipino,Chinese": 0.07386363636363637, + "Indonesian,Spanish,Filipino,Chinese": 0.0625, + "Malay,English,Spanish,Filipino": 0.045454545454545456, + "Malay,English,Spanish,Chinese": 0.056818181818181816, + "Malay,English,Filipino,Chinese": 0.07386363636363637, + "Malay,Spanish,Filipino,Chinese": 0.05113636363636364, + "English,Spanish,Filipino,Chinese": 0.03977272727272727 + }, + "5_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish": 0.09659090909090909, + "Vietnamese,Indonesian,Malay,English,Filipino": 0.05113636363636364, + "Vietnamese,Indonesian,Malay,English,Chinese": 0.05113636363636364, + "Vietnamese,Indonesian,Malay,Spanish,Filipino": 0.05113636363636364, + "Vietnamese,Indonesian,Malay,Spanish,Chinese": 0.045454545454545456, + "Vietnamese,Indonesian,Malay,Filipino,Chinese": 0.056818181818181816, + "Vietnamese,Indonesian,English,Spanish,Filipino": 0.045454545454545456, + "Vietnamese,Indonesian,English,Spanish,Chinese": 0.05113636363636364, + "Vietnamese,Indonesian,English,Filipino,Chinese": 0.03977272727272727, + "Vietnamese,Indonesian,Spanish,Filipino,Chinese": 0.03409090909090909, + "Vietnamese,Malay,English,Spanish,Filipino": 0.045454545454545456, + "Vietnamese,Malay,English,Spanish,Chinese": 0.05113636363636364, + "Vietnamese,Malay,English,Filipino,Chinese": 0.05113636363636364, + "Vietnamese,Malay,Spanish,Filipino,Chinese": 0.03977272727272727, + "Vietnamese,English,Spanish,Filipino,Chinese": 0.03409090909090909, + "Indonesian,Malay,English,Spanish,Filipino": 0.03977272727272727, + "Indonesian,Malay,English,Spanish,Chinese": 0.03977272727272727, + "Indonesian,Malay,English,Filipino,Chinese": 0.03977272727272727, + "Indonesian,Malay,Spanish,Filipino,Chinese": 0.045454545454545456, + "Indonesian,English,Spanish,Filipino,Chinese": 0.028409090909090908, + "Malay,English,Spanish,Filipino,Chinese": 0.03409090909090909 + }, + "6_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish,Filipino": 0.03977272727272727, + "Vietnamese,Indonesian,Malay,English,Spanish,Chinese": 0.03977272727272727, + "Vietnamese,Indonesian,Malay,English,Filipino,Chinese": 0.03977272727272727, + "Vietnamese,Indonesian,Malay,Spanish,Filipino,Chinese": 0.03409090909090909, + "Vietnamese,Indonesian,English,Spanish,Filipino,Chinese": 0.028409090909090908, + "Vietnamese,Malay,English,Spanish,Filipino,Chinese": 0.03409090909090909, + "Indonesian,Malay,English,Spanish,Filipino,Chinese": 0.028409090909090908 + }, + "7_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish,Filipino,Chinese": 0.028409090909090908 + } + }, + "AC3_2": 0.29866459997870176, + "AC3_3": 0.18668896073334287, + "AC3_4": 0.11430829624146072, + "AC3_5": 0.07857915891069081, + "AC3_6": 0.061558976716712815, + "AC3_7": 0.05123212459922631 }, - "prompt_2": { - "accuracy": 0.2912621359223301 + "prompt_5": { + "overall_acc": 0.2556818181818182, + "language_acc": { + "Vietnamese": 0.2556818181818182, + "Indonesian": 0.25, + "Malay": 0.26136363636363635, + "English": 0.23863636363636365, + "Spanish": 0.2784090909090909, + "Filipino": 0.23295454545454544, + "Chinese": 0.2727272727272727 + }, + "consistency_score_2": 0.3484848484848484, + "consistency_score_3": 0.1431818181818182, + "consistency_score_4": 0.06542207792207791, + "consistency_score_5": 0.03246753246753246, + "consistency_score_6": 0.015422077922077924, + "consistency_score_7": 0.005681818181818182, + "detailed_consistency_score": { + "2_combine": { + "Vietnamese,Indonesian": 0.2840909090909091, + "Vietnamese,Malay": 0.4772727272727273, + "Vietnamese,English": 0.4659090909090909, + "Vietnamese,Spanish": 0.5227272727272727, + "Vietnamese,Filipino": 0.14772727272727273, + "Vietnamese,Chinese": 0.19318181818181818, + "Indonesian,Malay": 0.3352272727272727, + "Indonesian,English": 0.42045454545454547, + "Indonesian,Spanish": 0.2727272727272727, + "Indonesian,Filipino": 0.4318181818181818, + "Indonesian,Chinese": 0.3977272727272727, + "Malay,English": 0.3693181818181818, + "Malay,Spanish": 0.45454545454545453, + "Malay,Filipino": 0.29545454545454547, + "Malay,Chinese": 0.24431818181818182, + "English,Spanish": 0.4659090909090909, + "English,Filipino": 0.30113636363636365, + "English,Chinese": 0.3693181818181818, + "Spanish,Filipino": 0.2215909090909091, + "Spanish,Chinese": 0.25, + "Filipino,Chinese": 0.3977272727272727 + }, + "3_combine": { + "Vietnamese,Indonesian,Malay": 0.16477272727272727, + "Vietnamese,Indonesian,English": 0.17613636363636365, + "Vietnamese,Indonesian,Spanish": 0.14204545454545456, + "Vietnamese,Indonesian,Filipino": 0.07954545454545454, + "Vietnamese,Indonesian,Chinese": 0.07954545454545454, + "Vietnamese,Malay,English": 0.25, + "Vietnamese,Malay,Spanish": 0.30113636363636365, + "Vietnamese,Malay,Filipino": 0.07386363636363637, + "Vietnamese,Malay,Chinese": 0.06818181818181818, + "Vietnamese,English,Spanish": 0.2897727272727273, + "Vietnamese,English,Filipino": 0.08522727272727272, + "Vietnamese,English,Chinese": 0.13068181818181818, + "Vietnamese,Spanish,Filipino": 0.07386363636363637, + "Vietnamese,Spanish,Chinese": 0.09659090909090909, + "Vietnamese,Filipino,Chinese": 0.06818181818181818, + "Indonesian,Malay,English": 0.14204545454545456, + "Indonesian,Malay,Spanish": 0.16477272727272727, + "Indonesian,Malay,Filipino": 0.13068181818181818, + "Indonesian,Malay,Chinese": 0.10227272727272728, + "Indonesian,English,Spanish": 0.16477272727272727, + "Indonesian,English,Filipino": 0.20454545454545456, + "Indonesian,English,Chinese": 0.19886363636363635, + "Indonesian,Spanish,Filipino": 0.10227272727272728, + "Indonesian,Spanish,Chinese": 0.09090909090909091, + "Indonesian,Filipino,Chinese": 0.23863636363636365, + "Malay,English,Spanish": 0.24431818181818182, + "Malay,English,Filipino": 0.11363636363636363, + "Malay,English,Chinese": 0.11931818181818182, + "Malay,Spanish,Filipino": 0.11931818181818182, + "Malay,Spanish,Chinese": 0.10795454545454546, + "Malay,Filipino,Chinese": 0.13068181818181818, + "English,Spanish,Filipino": 0.11363636363636363, + "English,Spanish,Chinese": 0.14772727272727273, + "English,Filipino,Chinese": 0.1875, + "Spanish,Filipino,Chinese": 0.10795454545454546 + }, + "4_combine": { + "Vietnamese,Indonesian,Malay,English": 0.09659090909090909, + "Vietnamese,Indonesian,Malay,Spanish": 0.10795454545454546, + "Vietnamese,Indonesian,Malay,Filipino": 0.03977272727272727, + "Vietnamese,Indonesian,Malay,Chinese": 0.028409090909090908, + "Vietnamese,Indonesian,English,Spanish": 0.09090909090909091, + "Vietnamese,Indonesian,English,Filipino": 0.0625, + "Vietnamese,Indonesian,English,Chinese": 0.056818181818181816, + "Vietnamese,Indonesian,Spanish,Filipino": 0.045454545454545456, + "Vietnamese,Indonesian,Spanish,Chinese": 0.028409090909090908, + "Vietnamese,Indonesian,Filipino,Chinese": 0.03977272727272727, + "Vietnamese,Malay,English,Spanish": 0.17613636363636365, + "Vietnamese,Malay,English,Filipino": 0.045454545454545456, + "Vietnamese,Malay,English,Chinese": 0.045454545454545456, + "Vietnamese,Malay,Spanish,Filipino": 0.045454545454545456, + "Vietnamese,Malay,Spanish,Chinese": 0.045454545454545456, + "Vietnamese,Malay,Filipino,Chinese": 0.03977272727272727, + "Vietnamese,English,Spanish,Filipino": 0.05113636363636364, + "Vietnamese,English,Spanish,Chinese": 0.07386363636363637, + "Vietnamese,English,Filipino,Chinese": 0.045454545454545456, + "Vietnamese,Spanish,Filipino,Chinese": 0.03977272727272727, + "Indonesian,Malay,English,Spanish": 0.10795454545454546, + "Indonesian,Malay,English,Filipino": 0.056818181818181816, + "Indonesian,Malay,English,Chinese": 0.056818181818181816, + "Indonesian,Malay,Spanish,Filipino": 0.0625, + "Indonesian,Malay,Spanish,Chinese": 0.05113636363636364, + "Indonesian,Malay,Filipino,Chinese": 0.06818181818181818, + "Indonesian,English,Spanish,Filipino": 0.06818181818181818, + "Indonesian,English,Spanish,Chinese": 0.0625, + "Indonesian,English,Filipino,Chinese": 0.13636363636363635, + "Indonesian,Spanish,Filipino,Chinese": 0.06818181818181818, + "Malay,English,Spanish,Filipino": 0.06818181818181818, + "Malay,English,Spanish,Chinese": 0.07386363636363637, + "Malay,English,Filipino,Chinese": 0.07386363636363637, + "Malay,Spanish,Filipino,Chinese": 0.056818181818181816, + "English,Spanish,Filipino,Chinese": 0.07386363636363637 + }, + "5_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish": 0.07386363636363637, + "Vietnamese,Indonesian,Malay,English,Filipino": 0.028409090909090908, + "Vietnamese,Indonesian,Malay,English,Chinese": 0.017045454545454544, + "Vietnamese,Indonesian,Malay,Spanish,Filipino": 0.03409090909090909, + "Vietnamese,Indonesian,Malay,Spanish,Chinese": 0.017045454545454544, + "Vietnamese,Indonesian,Malay,Filipino,Chinese": 0.022727272727272728, + "Vietnamese,Indonesian,English,Spanish,Filipino": 0.03409090909090909, + "Vietnamese,Indonesian,English,Spanish,Chinese": 0.017045454545454544, + "Vietnamese,Indonesian,English,Filipino,Chinese": 0.028409090909090908, + "Vietnamese,Indonesian,Spanish,Filipino,Chinese": 0.022727272727272728, + "Vietnamese,Malay,English,Spanish,Filipino": 0.03409090909090909, + "Vietnamese,Malay,English,Spanish,Chinese": 0.03409090909090909, + "Vietnamese,Malay,English,Filipino,Chinese": 0.022727272727272728, + "Vietnamese,Malay,Spanish,Filipino,Chinese": 0.028409090909090908, + "Vietnamese,English,Spanish,Filipino,Chinese": 0.028409090909090908, + "Indonesian,Malay,English,Spanish,Filipino": 0.03977272727272727, + "Indonesian,Malay,English,Spanish,Chinese": 0.03409090909090909, + "Indonesian,Malay,English,Filipino,Chinese": 0.03977272727272727, + "Indonesian,Malay,Spanish,Filipino,Chinese": 0.03977272727272727, + "Indonesian,English,Spanish,Filipino,Chinese": 0.045454545454545456, + "Malay,English,Spanish,Filipino,Chinese": 0.03977272727272727 + }, + "6_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish,Filipino": 0.022727272727272728, + "Vietnamese,Indonesian,Malay,English,Spanish,Chinese": 0.005681818181818182, + "Vietnamese,Indonesian,Malay,English,Filipino,Chinese": 0.011363636363636364, + "Vietnamese,Indonesian,Malay,Spanish,Filipino,Chinese": 0.017045454545454544, + "Vietnamese,Indonesian,English,Spanish,Filipino,Chinese": 0.011363636363636364, + "Vietnamese,Malay,English,Spanish,Filipino,Chinese": 0.017045454545454544, + "Indonesian,Malay,English,Spanish,Filipino,Chinese": 0.022727272727272728 + }, + "7_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish,Filipino,Chinese": 0.005681818181818182 + } + }, + "AC3_2": 0.29495582782236807, + "AC3_3": 0.1835664335204112, + "AC3_4": 0.1041858166786486, + "AC3_5": 0.05761843788013199, + "AC3_6": 0.029089548165644484, + "AC3_7": 0.011116600786260526 + } + }, + "sg_eval": { + "prompt_1": { + "accuracy": 0.32038834951456313 + }, + "prompt_2": { + "accuracy": 0.2912621359223301 }, "prompt_3": { "accuracy": 0.30097087378640774 }, - "prompt_4": -1, - "prompt_5": -1 + "prompt_4": { + "accuracy": 0.27184466019417475 + }, + "prompt_5": { + "accuracy": 0.2815533980582524 + } }, "cn_eval": { "prompt_1": { @@ -100788,8 +103354,12 @@ "prompt_3": { "accuracy": 0.3047619047619048 }, - "prompt_4": -1, - "prompt_5": -1 + "prompt_4": { + "accuracy": 0.22857142857142856 + }, + "prompt_5": { + "accuracy": 0.2857142857142857 + } }, "us_eval": { "prompt_1": { @@ -100801,8 +103371,12 @@ "prompt_3": { "accuracy": 0.2523364485981308 }, - "prompt_4": -1, - "prompt_5": -1 + "prompt_4": { + "accuracy": 0.22429906542056074 + }, + "prompt_5": { + "accuracy": 0.2616822429906542 + } }, "ph_eval": { "prompt_1": { @@ -100850,8 +103424,36 @@ "geography": 0.2 } }, - "prompt_4": -1, - "prompt_5": -1 + "prompt_4": { + "accuracy": 0.24, + "category_acc": { + "brand": 0.1, + "demographics": 0.6, + "biology": 0.4, + "history": 0.2, + "literature": 0.1, + "politics": 0.1, + "culture": 0.2, + "film": 0.3, + "law": 0.3, + "geography": 0.3 + } + }, + "prompt_5": { + "accuracy": 0.11, + "category_acc": { + "brand": 0.0, + "demographics": 0.0, + "biology": 0.2, + "history": 0.13333333333333333, + "literature": 0.1, + "politics": 0.0, + "culture": 0.1, + "film": 0.1, + "law": 0.2, + "geography": 0.2 + } + } }, "sing2eng": { "prompt_1": { @@ -100863,8 +103465,12 @@ "prompt_3": { "bleu_score": 0.06198404186724674 }, - "prompt_4": -1, - "prompt_5": -1 + "prompt_4": { + "bleu_score": 0.05405168319767091 + }, + "prompt_5": { + "bleu_score": 0.04584217954886731 + } }, "flores_ind2eng": { "prompt_1": { @@ -100876,8 +103482,12 @@ "prompt_3": { "bleu_score": 0.0671747898873427 }, - "prompt_4": -1, - "prompt_5": -1 + "prompt_4": { + "bleu_score": 0.07826796506401824 + }, + "prompt_5": { + "bleu_score": 0.039087715748873174 + } }, "flores_vie2eng": { "prompt_1": { @@ -100889,8 +103499,12 @@ "prompt_3": { "bleu_score": 0.05176664146658031 }, - "prompt_4": -1, - "prompt_5": -1 + "prompt_4": { + "bleu_score": 0.06817467987602832 + }, + "prompt_5": { + "bleu_score": 0.042225809505108826 + } }, "flores_zho2eng": { "prompt_1": { @@ -100902,8 +103516,12 @@ "prompt_3": { "bleu_score": 0.054601907354379434 }, - "prompt_4": -1, - "prompt_5": -1 + "prompt_4": { + "bleu_score": 0.05472829487488319 + }, + "prompt_5": { + "bleu_score": 0.052523096185344835 + } }, "flores_zsm2eng": { "prompt_1": { @@ -100915,8 +103533,12 @@ "prompt_3": { "bleu_score": 0.0660582091155768 }, - "prompt_4": -1, - "prompt_5": -1 + "prompt_4": { + "bleu_score": 0.07723223967687388 + }, + "prompt_5": { + "bleu_score": 0.042509582336678946 + } }, "mmlu": { "prompt_1": { @@ -100928,8 +103550,12 @@ "prompt_3": { "accuracy": 0.26254375729288215 }, - "prompt_4": -1, - "prompt_5": -1 + "prompt_4": { + "accuracy": 0.2532088681446908 + }, + "prompt_5": { + "accuracy": 0.27887981330221706 + } }, "mmlu_full": { "prompt_1": { @@ -101118,8 +103744,130 @@ "college_biology": 0.3006993006993007 } }, - "prompt_4": -1, - "prompt_5": -1 + "prompt_4": { + "accuracy": 0.2664998212370397, + "category_acc": { + "high_school_european_history": 0.2621951219512195, + "business_ethics": 0.2828282828282828, + "clinical_knowledge": 0.23484848484848486, + "medical_genetics": 0.30303030303030304, + "high_school_us_history": 0.22167487684729065, + "high_school_physics": 0.28, + "high_school_world_history": 0.25, + "virology": 0.24848484848484848, + "high_school_microeconomics": 0.25738396624472576, + "econometrics": 0.23893805309734514, + "college_computer_science": 0.36363636363636365, + "high_school_biology": 0.24271844660194175, + "abstract_algebra": 0.2222222222222222, + "professional_accounting": 0.25622775800711745, + "philosophy": 0.2903225806451613, + "professional_medicine": 0.25461254612546125, + "nutrition": 0.3081967213114754, + "global_facts": 0.29292929292929293, + "machine_learning": 0.2702702702702703, + "security_studies": 0.29098360655737704, + "public_relations": 0.23853211009174313, + "professional_psychology": 0.24058919803600654, + "prehistory": 0.29102167182662536, + "anatomy": 0.34328358208955223, + "human_sexuality": 0.34615384615384615, + "college_medicine": 0.26744186046511625, + "high_school_government_and_politics": 0.3020833333333333, + "college_chemistry": 0.30303030303030304, + "logical_fallacies": 0.2839506172839506, + "high_school_geography": 0.26903553299492383, + "elementary_mathematics": 0.22811671087533156, + "human_aging": 0.21621621621621623, + "college_mathematics": 0.32323232323232326, + "high_school_psychology": 0.2665441176470588, + "formal_logic": 0.288, + "high_school_statistics": 0.26976744186046514, + "international_law": 0.3, + "high_school_mathematics": 0.25650557620817843, + "high_school_computer_science": 0.2828282828282828, + "conceptual_physics": 0.24786324786324787, + "miscellaneous": 0.2672634271099744, + "high_school_chemistry": 0.27722772277227725, + "marketing": 0.3090128755364807, + "professional_law": 0.2628832354859752, + "management": 0.2549019607843137, + "college_physics": 0.19801980198019803, + "jurisprudence": 0.24299065420560748, + "world_religions": 0.24705882352941178, + "sociology": 0.25, + "us_foreign_policy": 0.35353535353535354, + "high_school_macroeconomics": 0.2827763496143959, + "computer_security": 0.18181818181818182, + "moral_scenarios": 0.2494407158836689, + "moral_disputes": 0.24057971014492754, + "electrical_engineering": 0.2986111111111111, + "astronomy": 0.36423841059602646, + "college_biology": 0.3006993006993007 + } + }, + "prompt_5": { + "accuracy": 0.2679299249195567, + "category_acc": { + "high_school_european_history": 0.2621951219512195, + "business_ethics": 0.2727272727272727, + "clinical_knowledge": 0.24621212121212122, + "medical_genetics": 0.2828282828282828, + "high_school_us_history": 0.20689655172413793, + "high_school_physics": 0.23333333333333334, + "high_school_world_history": 0.2754237288135593, + "virology": 0.21818181818181817, + "high_school_microeconomics": 0.28270042194092826, + "econometrics": 0.26548672566371684, + "college_computer_science": 0.3838383838383838, + "high_school_biology": 0.2621359223300971, + "abstract_algebra": 0.2727272727272727, + "professional_accounting": 0.2313167259786477, + "philosophy": 0.3, + "professional_medicine": 0.30996309963099633, + "nutrition": 0.3081967213114754, + "global_facts": 0.2222222222222222, + "machine_learning": 0.1981981981981982, + "security_studies": 0.2786885245901639, + "public_relations": 0.29357798165137616, + "professional_psychology": 0.26677577741407527, + "prehistory": 0.28173374613003094, + "anatomy": 0.373134328358209, + "human_sexuality": 0.3230769230769231, + "college_medicine": 0.26744186046511625, + "high_school_government_and_politics": 0.3020833333333333, + "college_chemistry": 0.3333333333333333, + "logical_fallacies": 0.32098765432098764, + "high_school_geography": 0.28426395939086296, + "elementary_mathematics": 0.20954907161803712, + "human_aging": 0.18468468468468469, + "college_mathematics": 0.30303030303030304, + "high_school_psychology": 0.2610294117647059, + "formal_logic": 0.24, + "high_school_statistics": 0.2558139534883721, + "international_law": 0.2833333333333333, + "high_school_mathematics": 0.2527881040892193, + "high_school_computer_science": 0.3434343434343434, + "conceptual_physics": 0.25213675213675213, + "miscellaneous": 0.2544757033248082, + "high_school_chemistry": 0.30198019801980197, + "marketing": 0.30042918454935624, + "professional_law": 0.25962165688193084, + "management": 0.27450980392156865, + "college_physics": 0.19801980198019803, + "jurisprudence": 0.24299065420560748, + "world_religions": 0.25882352941176473, + "sociology": 0.26, + "us_foreign_policy": 0.3333333333333333, + "high_school_macroeconomics": 0.2930591259640103, + "computer_security": 0.21212121212121213, + "moral_scenarios": 0.27181208053691275, + "moral_disputes": 0.2318840579710145, + "electrical_engineering": 0.2986111111111111, + "astronomy": 0.32450331125827814, + "college_biology": 0.2727272727272727 + } + } }, "c_eval": { "prompt_1": { @@ -101131,8 +103879,12 @@ "prompt_3": { "accuracy": 0.2578008915304606 }, - "prompt_4": -1, - "prompt_5": -1 + "prompt_4": { + "accuracy": 0.2578008915304606 + }, + "prompt_5": { + "accuracy": 0.24665676077265974 + } }, "c_eval_full": { "prompt_1": { @@ -101306,8 +104058,120 @@ "physician": 0.3148148148148148 } }, - "prompt_4": -1, - "prompt_5": -1 + "prompt_4": { + "accuracy": 0.2590286425902864, + "category_acc": { + "computer_network": 0.041666666666666664, + "operating_system": 0.4583333333333333, + "computer_architecture": 0.2692307692307692, + "college_programming": 0.21428571428571427, + "college_physics": 0.2916666666666667, + "college_chemistry": 0.2413793103448276, + "advanced_mathematics": 0.3333333333333333, + "probability_and_statistics": 0.34782608695652173, + "discrete_mathematics": 0.3333333333333333, + "electrical_engineer": 0.2619047619047619, + "metrology_engineer": 0.1724137931034483, + "high_school_mathematics": 0.2608695652173913, + "high_school_physics": 0.125, + "high_school_chemistry": 0.2916666666666667, + "high_school_biology": 0.16666666666666666, + "middle_school_mathematics": 0.2916666666666667, + "middle_school_biology": 0.34615384615384615, + "middle_school_physics": 0.2916666666666667, + "middle_school_chemistry": 0.36, + "veterinary_medicine": 0.21428571428571427, + "college_economics": 0.2, + "business_administration": 0.18421052631578946, + "marxism": 0.3333333333333333, + "mao_zedong_thought": 0.06896551724137931, + "education_science": 0.20588235294117646, + "teacher_qualification": 0.24489795918367346, + "high_school_politics": 0.08333333333333333, + "high_school_geography": 0.3333333333333333, + "middle_school_politics": 0.4230769230769231, + "middle_school_geography": 0.4117647058823529, + "modern_chinese_history": 0.42857142857142855, + "ideological_and_moral_cultivation": 0.20833333333333334, + "logic": 0.3333333333333333, + "law": 0.27586206896551724, + "chinese_language_and_literature": 0.39285714285714285, + "art_studies": 0.21052631578947367, + "professional_tour_guide": 0.3235294117647059, + "legal_professional": 0.25, + "high_school_chinese": 0.25, + "high_school_history": 0.04, + "middle_school_history": 0.2962962962962963, + "civil_servant": 0.19230769230769232, + "sports_science": 0.25, + "plant_protection": 0.2222222222222222, + "basic_medicine": 0.5, + "clinical_medicine": 0.25925925925925924, + "urban_and_rural_planner": 0.19607843137254902, + "accountant": 0.24074074074074073, + "fire_engineer": 0.19444444444444445, + "environmental_impact_assessment_engineer": 0.3055555555555556, + "tax_accountant": 0.2962962962962963, + "physician": 0.3148148148148148 + } + }, + "prompt_5": { + "accuracy": 0.263387297633873, + "category_acc": { + "computer_network": 0.125, + "operating_system": 0.4583333333333333, + "computer_architecture": 0.2692307692307692, + "college_programming": 0.23809523809523808, + "college_physics": 0.2916666666666667, + "college_chemistry": 0.20689655172413793, + "advanced_mathematics": 0.2916666666666667, + "probability_and_statistics": 0.30434782608695654, + "discrete_mathematics": 0.3333333333333333, + "electrical_engineer": 0.23809523809523808, + "metrology_engineer": 0.1724137931034483, + "high_school_mathematics": 0.43478260869565216, + "high_school_physics": 0.16666666666666666, + "high_school_chemistry": 0.2916666666666667, + "high_school_biology": 0.16666666666666666, + "middle_school_mathematics": 0.25, + "middle_school_biology": 0.3076923076923077, + "middle_school_physics": 0.2916666666666667, + "middle_school_chemistry": 0.36, + "veterinary_medicine": 0.14285714285714285, + "college_economics": 0.23333333333333334, + "business_administration": 0.18421052631578946, + "marxism": 0.3333333333333333, + "mao_zedong_thought": 0.1724137931034483, + "education_science": 0.20588235294117646, + "teacher_qualification": 0.2857142857142857, + "high_school_politics": 0.4166666666666667, + "high_school_geography": 0.25, + "middle_school_politics": 0.3076923076923077, + "middle_school_geography": 0.35294117647058826, + "modern_chinese_history": 0.39285714285714285, + "ideological_and_moral_cultivation": 0.20833333333333334, + "logic": 0.3333333333333333, + "law": 0.27586206896551724, + "chinese_language_and_literature": 0.32142857142857145, + "art_studies": 0.21052631578947367, + "professional_tour_guide": 0.23529411764705882, + "legal_professional": 0.17857142857142858, + "high_school_chinese": 0.25, + "high_school_history": 0.08, + "middle_school_history": 0.37037037037037035, + "civil_servant": 0.2692307692307692, + "sports_science": 0.25, + "plant_protection": 0.14814814814814814, + "basic_medicine": 0.4166666666666667, + "clinical_medicine": 0.25925925925925924, + "urban_and_rural_planner": 0.27450980392156865, + "accountant": 0.25925925925925924, + "fire_engineer": 0.25, + "environmental_impact_assessment_engineer": 0.2777777777777778, + "tax_accountant": 0.2777777777777778, + "physician": 0.2777777777777778 + } + } }, "cmmlu": { "prompt_1": { @@ -101319,8 +104183,12 @@ "prompt_3": { "accuracy": 0.2616487455197133 }, - "prompt_4": -1, - "prompt_5": -1 + "prompt_4": { + "accuracy": 0.2114695340501792 + }, + "prompt_5": { + "accuracy": 0.21505376344086022 + } }, "cmmlu_full": { "prompt_1": { @@ -101539,78 +104407,240 @@ "world_religions": 0.20625 } }, - "prompt_4": -1, - "prompt_5": -1 - }, - "zbench": { - "prompt_1": { - "accuracy": 0.24242424242424243 - }, - "prompt_2": { - "accuracy": 0.18181818181818182 - }, - "prompt_3": { - "accuracy": 0.12121212121212122 - }, - "prompt_4": -1, - "prompt_5": -1 - }, - "ind_emotion": { - "prompt_1": { - "accuracy": 0.15454545454545454 - }, - "prompt_2": { - "accuracy": 0.11136363636363636 - }, - "prompt_3": { - "accuracy": 0.12045454545454545 - }, - "prompt_4": -1, - "prompt_5": -1 - }, - "ocnli": { - "prompt_1": { - "accuracy": 0.3430508474576271 - }, - "prompt_2": { - "accuracy": 0.3325423728813559 - }, - "prompt_3": { - "accuracy": 0.3213559322033898 - }, - "prompt_4": -1, - "prompt_5": -1 - }, - "c3": { - "prompt_1": { - "accuracy": 0.27786088257292446 - }, - "prompt_2": { - "accuracy": 0.29768137621540763 - }, - "prompt_3": { - "accuracy": 0.29468960359012714 - }, - "prompt_4": -1, - "prompt_5": -1 - }, - "dream": { - "prompt_1": { - "accuracy": 0.3596276335129838 - }, - "prompt_2": { - "accuracy": 0.3444390004899559 - }, - "prompt_3": { - "accuracy": 0.3512983831455169 + "prompt_4": { + "accuracy": 0.2506475565532723, + "category_acc": { + "agronomy": 0.2603550295857988, + "anatomy": 0.22972972972972974, + "ancient_chinese": 0.2073170731707317, + "arts": 0.225, + "astronomy": 0.296969696969697, + "business_ethics": 0.2535885167464115, + "chinese_civil_service_exam": 0.225, + "chinese_driving_rule": 0.2595419847328244, + "chinese_food_culture": 0.22058823529411764, + "chinese_foreign_policy": 0.2803738317757009, + "chinese_history": 0.2260061919504644, + "chinese_literature": 0.2696078431372549, + "chinese_teacher_qualification": 0.2849162011173184, + "clinical_knowledge": 0.2742616033755274, + "college_actuarial_science": 0.2358490566037736, + "college_education": 0.22429906542056074, + "college_engineering_hydrology": 0.2169811320754717, + "college_law": 0.3148148148148148, + "college_mathematics": 0.1619047619047619, + "college_medical_statistics": 0.24528301886792453, + "college_medicine": 0.2490842490842491, + "computer_science": 0.2549019607843137, + "computer_security": 0.29239766081871343, + "conceptual_physics": 0.23809523809523808, + "construction_project_management": 0.26618705035971224, + "economics": 0.27672955974842767, + "education": 0.22085889570552147, + "electrical_engineering": 0.22674418604651161, + "elementary_chinese": 0.19444444444444445, + "elementary_commonsense": 0.2828282828282828, + "elementary_information_and_technology": 0.2605042016806723, + "elementary_mathematics": 0.30869565217391304, + "ethnology": 0.25925925925925924, + "food_science": 0.2727272727272727, + "genetics": 0.23295454545454544, + "global_facts": 0.21476510067114093, + "high_school_biology": 0.27218934911242604, + "high_school_chemistry": 0.20454545454545456, + "high_school_geography": 0.22033898305084745, + "high_school_mathematics": 0.25609756097560976, + "high_school_physics": 0.24545454545454545, + "high_school_politics": 0.21678321678321677, + "human_sexuality": 0.2698412698412698, + "international_law": 0.25405405405405407, + "journalism": 0.25, + "jurisprudence": 0.25304136253041365, + "legal_and_moral_basis": 0.24299065420560748, + "logical": 0.24390243902439024, + "machine_learning": 0.3114754098360656, + "management": 0.23333333333333334, + "marketing": 0.2611111111111111, + "marxist_theory": 0.2857142857142857, + "modern_chinese": 0.20689655172413793, + "nutrition": 0.27586206896551724, + "philosophy": 0.23809523809523808, + "professional_accounting": 0.26285714285714284, + "professional_law": 0.2843601895734597, + "professional_medicine": 0.23670212765957446, + "professional_psychology": 0.21551724137931033, + "public_relations": 0.22988505747126436, + "security_study": 0.23703703703703705, + "sociology": 0.26548672566371684, + "sports_science": 0.2787878787878788, + "traditional_chinese_medicine": 0.2702702702702703, + "virology": 0.26627218934911245, + "world_history": 0.22981366459627328, + "world_religions": 0.26875 + } }, - "prompt_4": -1, - "prompt_5": -1 - }, - "samsum": { - "prompt_1": { - "rouge1": 0.20708500246015654, - "rouge2": 0.06559723811622607, + "prompt_5": { + "accuracy": 0.2526333966499741, + "category_acc": { + "agronomy": 0.28402366863905326, + "anatomy": 0.25675675675675674, + "ancient_chinese": 0.22560975609756098, + "arts": 0.225, + "astronomy": 0.22424242424242424, + "business_ethics": 0.2631578947368421, + "chinese_civil_service_exam": 0.28125, + "chinese_driving_rule": 0.24427480916030533, + "chinese_food_culture": 0.2426470588235294, + "chinese_foreign_policy": 0.2523364485981308, + "chinese_history": 0.23529411764705882, + "chinese_literature": 0.25, + "chinese_teacher_qualification": 0.25139664804469275, + "clinical_knowledge": 0.25738396624472576, + "college_actuarial_science": 0.2358490566037736, + "college_education": 0.17757009345794392, + "college_engineering_hydrology": 0.22641509433962265, + "college_law": 0.3055555555555556, + "college_mathematics": 0.22857142857142856, + "college_medical_statistics": 0.2358490566037736, + "college_medicine": 0.23443223443223443, + "computer_science": 0.23529411764705882, + "computer_security": 0.2807017543859649, + "conceptual_physics": 0.2653061224489796, + "construction_project_management": 0.2517985611510791, + "economics": 0.27672955974842767, + "education": 0.2883435582822086, + "electrical_engineering": 0.23255813953488372, + "elementary_chinese": 0.17857142857142858, + "elementary_commonsense": 0.26262626262626265, + "elementary_information_and_technology": 0.25630252100840334, + "elementary_mathematics": 0.27391304347826084, + "ethnology": 0.26666666666666666, + "food_science": 0.24475524475524477, + "genetics": 0.2556818181818182, + "global_facts": 0.22818791946308725, + "high_school_biology": 0.25443786982248523, + "high_school_chemistry": 0.24242424242424243, + "high_school_geography": 0.211864406779661, + "high_school_mathematics": 0.22560975609756098, + "high_school_physics": 0.20909090909090908, + "high_school_politics": 0.25874125874125875, + "human_sexuality": 0.29365079365079366, + "international_law": 0.2648648648648649, + "journalism": 0.2441860465116279, + "jurisprudence": 0.2725060827250608, + "legal_and_moral_basis": 0.2616822429906542, + "logical": 0.2764227642276423, + "machine_learning": 0.3524590163934426, + "management": 0.23333333333333334, + "marketing": 0.22777777777777777, + "marxist_theory": 0.2804232804232804, + "modern_chinese": 0.19827586206896552, + "nutrition": 0.2827586206896552, + "philosophy": 0.2857142857142857, + "professional_accounting": 0.2742857142857143, + "professional_law": 0.27488151658767773, + "professional_medicine": 0.24468085106382978, + "professional_psychology": 0.23706896551724138, + "public_relations": 0.25862068965517243, + "security_study": 0.2740740740740741, + "sociology": 0.24778761061946902, + "sports_science": 0.23636363636363636, + "traditional_chinese_medicine": 0.2756756756756757, + "virology": 0.26627218934911245, + "world_history": 0.2670807453416149, + "world_religions": 0.26875 + } + } + }, + "zbench": { + "prompt_1": { + "accuracy": 0.24242424242424243 + }, + "prompt_2": { + "accuracy": 0.18181818181818182 + }, + "prompt_3": { + "accuracy": 0.12121212121212122 + }, + "prompt_4": { + "accuracy": 0.2727272727272727 + }, + "prompt_5": { + "accuracy": 0.24242424242424243 + } + }, + "ind_emotion": { + "prompt_1": { + "accuracy": 0.15454545454545454 + }, + "prompt_2": { + "accuracy": 0.11136363636363636 + }, + "prompt_3": { + "accuracy": 0.12045454545454545 + }, + "prompt_4": { + "accuracy": 0.15454545454545454 + }, + "prompt_5": { + "accuracy": 0.15454545454545454 + } + }, + "ocnli": { + "prompt_1": { + "accuracy": 0.3430508474576271 + }, + "prompt_2": { + "accuracy": 0.3325423728813559 + }, + "prompt_3": { + "accuracy": 0.3213559322033898 + }, + "prompt_4": { + "accuracy": 0.34 + }, + "prompt_5": { + "accuracy": 0.34440677966101696 + } + }, + "c3": { + "prompt_1": { + "accuracy": 0.27786088257292446 + }, + "prompt_2": { + "accuracy": 0.29768137621540763 + }, + "prompt_3": { + "accuracy": 0.29468960359012714 + }, + "prompt_4": { + "accuracy": 0.27299925205684367 + }, + "prompt_5": { + "accuracy": 0.2819745699326851 + } + }, + "dream": { + "prompt_1": { + "accuracy": 0.3596276335129838 + }, + "prompt_2": { + "accuracy": 0.3444390004899559 + }, + "prompt_3": { + "accuracy": 0.3512983831455169 + }, + "prompt_4": { + "accuracy": 0.35864772170504655 + }, + "prompt_5": { + "accuracy": 0.3468887800097991 + } + }, + "samsum": { + "prompt_1": { + "rouge1": 0.20708500246015654, + "rouge2": 0.06559723811622607, "rougeL": 0.16037315643415484, "avg_rouge": 0.1443517990035125 }, @@ -101626,8 +104656,18 @@ "rougeL": 0.12129155012650203, "avg_rouge": 0.10672613888760114 }, - "prompt_4": -1, - "prompt_5": -1 + "prompt_4": { + "rouge1": 0.20094662523236545, + "rouge2": 0.06376961391715008, + "rougeL": 0.15574785708691388, + "avg_rouge": 0.14015469874547648 + }, + "prompt_5": { + "rouge1": 0.17822855705326188, + "rouge2": 0.05576964640513887, + "rougeL": 0.1400374512637407, + "avg_rouge": 0.12467855157404716 + } }, "dialogsum": { "prompt_1": { @@ -101648,8 +104688,18 @@ "rougeL": 0.15538268808417005, "avg_rouge": 0.14145461181370092 }, - "prompt_4": -1, - "prompt_5": -1 + "prompt_4": { + "rouge1": 0.2086362444037295, + "rouge2": 0.057526481552556304, + "rougeL": 0.15419209708250917, + "avg_rouge": 0.140118274346265 + }, + "prompt_5": { + "rouge1": 0.17118387735342586, + "rouge2": 0.04314537927614836, + "rougeL": 0.13291313913576316, + "avg_rouge": 0.11574746525511247 + } }, "sst2": { "prompt_1": { @@ -101661,8 +104711,12 @@ "prompt_3": { "accuracy": 0.5286697247706422 }, - "prompt_4": -1, - "prompt_5": -1 + "prompt_4": { + "accuracy": 0.5298165137614679 + }, + "prompt_5": { + "accuracy": 0.5745412844036697 + } }, "cola": { "prompt_1": { @@ -101674,8 +104728,12 @@ "prompt_3": { "accuracy": 0.5292425695110259 }, - "prompt_4": -1, - "prompt_5": -1 + "prompt_4": { + "accuracy": 0.6395014381591563 + }, + "prompt_5": { + "accuracy": 0.6903163950143816 + } }, "qqp": { "prompt_1": { @@ -101687,8 +104745,12 @@ "prompt_3": { "accuracy": 0.506 }, - "prompt_4": -1, - "prompt_5": -1 + "prompt_4": { + "accuracy": 0.492 + }, + "prompt_5": { + "accuracy": 0.465 + } }, "mnli": { "prompt_1": { @@ -101700,8 +104762,12 @@ "prompt_3": { "accuracy": 0.354 }, - "prompt_4": -1, - "prompt_5": -1 + "prompt_4": { + "accuracy": 0.3425 + }, + "prompt_5": { + "accuracy": 0.354 + } }, "qnli": { "prompt_1": { @@ -101713,8 +104779,12 @@ "prompt_3": { "accuracy": 0.5295 }, - "prompt_4": -1, - "prompt_5": -1 + "prompt_4": { + "accuracy": 0.509 + }, + "prompt_5": { + "accuracy": 0.494 + } }, "wnli": { "prompt_1": { @@ -101726,8 +104796,12 @@ "prompt_3": { "accuracy": 0.4225352112676056 }, - "prompt_4": -1, - "prompt_5": -1 + "prompt_4": { + "accuracy": 0.43661971830985913 + }, + "prompt_5": { + "accuracy": 0.4647887323943662 + } }, "rte": { "prompt_1": { @@ -101739,8 +104813,12 @@ "prompt_3": { "accuracy": 0.5126353790613718 }, - "prompt_4": -1, - "prompt_5": -1 + "prompt_4": { + "accuracy": 0.5018050541516246 + }, + "prompt_5": { + "accuracy": 0.516245487364621 + } }, "mrpc": { "prompt_1": { @@ -101752,8 +104830,12 @@ "prompt_3": { "accuracy": 0.5318627450980392 }, - "prompt_4": -1, - "prompt_5": -1 + "prompt_4": { + "accuracy": 0.41911764705882354 + }, + "prompt_5": { + "accuracy": 0.5245098039215687 + } } }, "five_shot": { @@ -102131,560 +105213,5981 @@ "model_link": "https://huggingface.co/aisingapore/sealion7b-instruct-nc", "zero_shot": { "cross_mmlu": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 - }, - "cross_logiqa": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 - }, - "sg_eval": { "prompt_1": { - "accuracy": 0.2524271844660194 - }, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 - }, - "cn_eval": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 - }, - "us_eval": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, + "overall_acc": 0.2780952380952381, + "language_acc": { + "English": 0.25333333333333335, + "Vietnamese": 0.30666666666666664, + "Malay": 0.26, + "Indonesian": 0.2733333333333333, + "Spanish": 0.29333333333333333, + "Chinese": 0.29333333333333333, + "Filipino": 0.26666666666666666 + }, + "consistency_score_2": 0.4965079365079366, + "consistency_score_3": 0.2963809523809524, + "consistency_score_4": 0.19314285714285712, + "consistency_score_5": 0.133968253968254, + "consistency_score_6": 0.09714285714285716, + "consistency_score_7": 0.07333333333333333, + "detailed_consistency_score": { + "2_combine": { + "English,Vietnamese": 0.49333333333333335, + "English,Malay": 0.49333333333333335, + "English,Indonesian": 0.5733333333333334, + "English,Spanish": 0.58, + "English,Chinese": 0.4666666666666667, + "English,Filipino": 0.4866666666666667, + "Vietnamese,Malay": 0.44, + "Vietnamese,Indonesian": 0.54, + "Vietnamese,Spanish": 0.5133333333333333, + "Vietnamese,Chinese": 0.4533333333333333, + "Vietnamese,Filipino": 0.48, + "Malay,Indonesian": 0.5533333333333333, + "Malay,Spanish": 0.47333333333333333, + "Malay,Chinese": 0.46, + "Malay,Filipino": 0.4533333333333333, + "Indonesian,Spanish": 0.5733333333333334, + "Indonesian,Chinese": 0.47333333333333333, + "Indonesian,Filipino": 0.5266666666666666, + "Spanish,Chinese": 0.4066666666666667, + "Spanish,Filipino": 0.49333333333333335, + "Chinese,Filipino": 0.49333333333333335 + }, + "3_combine": { + "English,Vietnamese,Malay": 0.2733333333333333, + "English,Vietnamese,Indonesian": 0.35333333333333333, + "English,Vietnamese,Spanish": 0.34, + "English,Vietnamese,Chinese": 0.29333333333333333, + "English,Vietnamese,Filipino": 0.2733333333333333, + "English,Malay,Indonesian": 0.36, + "English,Malay,Spanish": 0.34, + "English,Malay,Chinese": 0.26, + "English,Malay,Filipino": 0.28, + "English,Indonesian,Spanish": 0.38666666666666666, + "English,Indonesian,Chinese": 0.29333333333333333, + "English,Indonesian,Filipino": 0.3333333333333333, + "English,Spanish,Chinese": 0.2733333333333333, + "English,Spanish,Filipino": 0.30666666666666664, + "English,Chinese,Filipino": 0.2733333333333333, + "Vietnamese,Malay,Indonesian": 0.30666666666666664, + "Vietnamese,Malay,Spanish": 0.26666666666666666, + "Vietnamese,Malay,Chinese": 0.22666666666666666, + "Vietnamese,Malay,Filipino": 0.24666666666666667, + "Vietnamese,Indonesian,Spanish": 0.36, + "Vietnamese,Indonesian,Chinese": 0.3, + "Vietnamese,Indonesian,Filipino": 0.32666666666666666, + "Vietnamese,Spanish,Chinese": 0.26, + "Vietnamese,Spanish,Filipino": 0.30666666666666664, + "Vietnamese,Chinese,Filipino": 0.28, + "Malay,Indonesian,Spanish": 0.35333333333333333, + "Malay,Indonesian,Chinese": 0.29333333333333333, + "Malay,Indonesian,Filipino": 0.3, + "Malay,Spanish,Chinese": 0.23333333333333334, + "Malay,Spanish,Filipino": 0.25333333333333335, + "Malay,Chinese,Filipino": 0.26, + "Indonesian,Spanish,Chinese": 0.28, + "Indonesian,Spanish,Filipino": 0.3333333333333333, + "Indonesian,Chinese,Filipino": 0.2866666666666667, + "Spanish,Chinese,Filipino": 0.26 + }, + "4_combine": { + "English,Vietnamese,Malay,Indonesian": 0.22, + "English,Vietnamese,Malay,Spanish": 0.20666666666666667, + "English,Vietnamese,Malay,Chinese": 0.16666666666666666, + "English,Vietnamese,Malay,Filipino": 0.18, + "English,Vietnamese,Indonesian,Spanish": 0.23333333333333334, + "English,Vietnamese,Indonesian,Chinese": 0.21333333333333335, + "English,Vietnamese,Indonesian,Filipino": 0.22, + "English,Vietnamese,Spanish,Chinese": 0.2, + "English,Vietnamese,Spanish,Filipino": 0.19333333333333333, + "English,Vietnamese,Chinese,Filipino": 0.18666666666666668, + "English,Malay,Indonesian,Spanish": 0.26666666666666666, + "English,Malay,Indonesian,Chinese": 0.18666666666666668, + "English,Malay,Indonesian,Filipino": 0.22, + "English,Malay,Spanish,Chinese": 0.17333333333333334, + "English,Malay,Spanish,Filipino": 0.18666666666666668, + "English,Malay,Chinese,Filipino": 0.16666666666666666, + "English,Indonesian,Spanish,Chinese": 0.20666666666666667, + "English,Indonesian,Spanish,Filipino": 0.22666666666666666, + "English,Indonesian,Chinese,Filipino": 0.18666666666666668, + "English,Spanish,Chinese,Filipino": 0.17333333333333334, + "Vietnamese,Malay,Indonesian,Spanish": 0.22, + "Vietnamese,Malay,Indonesian,Chinese": 0.17333333333333334, + "Vietnamese,Malay,Indonesian,Filipino": 0.19333333333333333, + "Vietnamese,Malay,Spanish,Chinese": 0.12666666666666668, + "Vietnamese,Malay,Spanish,Filipino": 0.16, + "Vietnamese,Malay,Chinese,Filipino": 0.15333333333333332, + "Vietnamese,Indonesian,Spanish,Chinese": 0.19333333333333333, + "Vietnamese,Indonesian,Spanish,Filipino": 0.25333333333333335, + "Vietnamese,Indonesian,Chinese,Filipino": 0.2, + "Vietnamese,Spanish,Chinese,Filipino": 0.19333333333333333, + "Malay,Indonesian,Spanish,Chinese": 0.18, + "Malay,Indonesian,Spanish,Filipino": 0.19333333333333333, + "Malay,Indonesian,Chinese,Filipino": 0.17333333333333334, + "Malay,Spanish,Chinese,Filipino": 0.14, + "Indonesian,Spanish,Chinese,Filipino": 0.19333333333333333 + }, + "5_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish": 0.16666666666666666, + "English,Vietnamese,Malay,Indonesian,Chinese": 0.13333333333333333, + "English,Vietnamese,Malay,Indonesian,Filipino": 0.15333333333333332, + "English,Vietnamese,Malay,Spanish,Chinese": 0.11333333333333333, + "English,Vietnamese,Malay,Spanish,Filipino": 0.12666666666666668, + "English,Vietnamese,Malay,Chinese,Filipino": 0.12, + "English,Vietnamese,Indonesian,Spanish,Chinese": 0.14666666666666667, + "English,Vietnamese,Indonesian,Spanish,Filipino": 0.16, + "English,Vietnamese,Indonesian,Chinese,Filipino": 0.14666666666666667, + "English,Vietnamese,Spanish,Chinese,Filipino": 0.14, + "English,Malay,Indonesian,Spanish,Chinese": 0.14, + "English,Malay,Indonesian,Spanish,Filipino": 0.15333333333333332, + "English,Malay,Indonesian,Chinese,Filipino": 0.12666666666666668, + "English,Malay,Spanish,Chinese,Filipino": 0.10666666666666667, + "English,Indonesian,Spanish,Chinese,Filipino": 0.14, + "Vietnamese,Malay,Indonesian,Spanish,Chinese": 0.10666666666666667, + "Vietnamese,Malay,Indonesian,Spanish,Filipino": 0.14666666666666667, + "Vietnamese,Malay,Indonesian,Chinese,Filipino": 0.12, + "Vietnamese,Malay,Spanish,Chinese,Filipino": 0.09333333333333334, + "Vietnamese,Indonesian,Spanish,Chinese,Filipino": 0.16, + "Malay,Indonesian,Spanish,Chinese,Filipino": 0.11333333333333333 + }, + "6_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish,Chinese": 0.09333333333333334, + "English,Vietnamese,Malay,Indonesian,Spanish,Filipino": 0.11333333333333333, + "English,Vietnamese,Malay,Indonesian,Chinese,Filipino": 0.1, + "English,Vietnamese,Malay,Spanish,Chinese,Filipino": 0.08, + "English,Vietnamese,Indonesian,Spanish,Chinese,Filipino": 0.11333333333333333, + "English,Malay,Indonesian,Spanish,Chinese,Filipino": 0.09333333333333334, + "Vietnamese,Malay,Indonesian,Spanish,Chinese,Filipino": 0.08666666666666667 + }, + "7_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish,Chinese,Filipino": 0.07333333333333333 + } + }, + "AC3_2": 0.3565089773154117, + "AC3_3": 0.28694707586264384, + "AC3_4": 0.2279616583417393, + "AC3_5": 0.18082617942636417, + "AC3_6": 0.1439883973510401, + "AC3_7": 0.11606142724791386 + }, + "prompt_2": { + "overall_acc": 0.2552380952380952, + "language_acc": { + "English": 0.3, + "Vietnamese": 0.25333333333333335, + "Malay": 0.2733333333333333, + "Indonesian": 0.25333333333333335, + "Spanish": 0.20666666666666667, + "Chinese": 0.2, + "Filipino": 0.3 + }, + "consistency_score_2": 0.29333333333333333, + "consistency_score_3": 0.10266666666666666, + "consistency_score_4": 0.04095238095238096, + "consistency_score_5": 0.016825396825396827, + "consistency_score_6": 0.005714285714285714, + "consistency_score_7": 0.0, + "detailed_consistency_score": { + "2_combine": { + "English,Vietnamese": 0.4, + "English,Malay": 0.25333333333333335, + "English,Indonesian": 0.32, + "English,Spanish": 0.22, + "English,Chinese": 0.26, + "English,Filipino": 0.32, + "Vietnamese,Malay": 0.2866666666666667, + "Vietnamese,Indonesian": 0.36, + "Vietnamese,Spanish": 0.29333333333333333, + "Vietnamese,Chinese": 0.29333333333333333, + "Vietnamese,Filipino": 0.30666666666666664, + "Malay,Indonesian": 0.36666666666666664, + "Malay,Spanish": 0.21333333333333335, + "Malay,Chinese": 0.26666666666666666, + "Malay,Filipino": 0.29333333333333333, + "Indonesian,Spanish": 0.26, + "Indonesian,Chinese": 0.32, + "Indonesian,Filipino": 0.38666666666666666, + "Spanish,Chinese": 0.23333333333333334, + "Spanish,Filipino": 0.19333333333333333, + "Chinese,Filipino": 0.31333333333333335 + }, + "3_combine": { + "English,Vietnamese,Malay": 0.10666666666666667, + "English,Vietnamese,Indonesian": 0.16, + "English,Vietnamese,Spanish": 0.10666666666666667, + "English,Vietnamese,Chinese": 0.13333333333333333, + "English,Vietnamese,Filipino": 0.13333333333333333, + "English,Malay,Indonesian": 0.12, + "English,Malay,Spanish": 0.04, + "English,Malay,Chinese": 0.07333333333333333, + "English,Malay,Filipino": 0.07333333333333333, + "English,Indonesian,Spanish": 0.08666666666666667, + "English,Indonesian,Chinese": 0.10666666666666667, + "English,Indonesian,Filipino": 0.15333333333333332, + "English,Spanish,Chinese": 0.06666666666666667, + "English,Spanish,Filipino": 0.06666666666666667, + "English,Chinese,Filipino": 0.12, + "Vietnamese,Malay,Indonesian": 0.15333333333333332, + "Vietnamese,Malay,Spanish": 0.07333333333333333, + "Vietnamese,Malay,Chinese": 0.1, + "Vietnamese,Malay,Filipino": 0.12, + "Vietnamese,Indonesian,Spanish": 0.11333333333333333, + "Vietnamese,Indonesian,Chinese": 0.12666666666666668, + "Vietnamese,Indonesian,Filipino": 0.16666666666666666, + "Vietnamese,Spanish,Chinese": 0.08, + "Vietnamese,Spanish,Filipino": 0.04666666666666667, + "Vietnamese,Chinese,Filipino": 0.12666666666666668, + "Malay,Indonesian,Spanish": 0.08666666666666667, + "Malay,Indonesian,Chinese": 0.11333333333333333, + "Malay,Indonesian,Filipino": 0.14, + "Malay,Spanish,Chinese": 0.03333333333333333, + "Malay,Spanish,Filipino": 0.05333333333333334, + "Malay,Chinese,Filipino": 0.10666666666666667, + "Indonesian,Spanish,Chinese": 0.09333333333333334, + "Indonesian,Spanish,Filipino": 0.09333333333333334, + "Indonesian,Chinese,Filipino": 0.16666666666666666, + "Spanish,Chinese,Filipino": 0.05333333333333334 + }, + "4_combine": { + "English,Vietnamese,Malay,Indonesian": 0.05333333333333334, + "English,Vietnamese,Malay,Spanish": 0.013333333333333334, + "English,Vietnamese,Malay,Chinese": 0.04, + "English,Vietnamese,Malay,Filipino": 0.04, + "English,Vietnamese,Indonesian,Spanish": 0.05333333333333334, + "English,Vietnamese,Indonesian,Chinese": 0.06, + "English,Vietnamese,Indonesian,Filipino": 0.08, + "English,Vietnamese,Spanish,Chinese": 0.04666666666666667, + "English,Vietnamese,Spanish,Filipino": 0.03333333333333333, + "English,Vietnamese,Chinese,Filipino": 0.06666666666666667, + "English,Malay,Indonesian,Spanish": 0.03333333333333333, + "English,Malay,Indonesian,Chinese": 0.04, + "English,Malay,Indonesian,Filipino": 0.04666666666666667, + "English,Malay,Spanish,Chinese": 0.006666666666666667, + "English,Malay,Spanish,Filipino": 0.013333333333333334, + "English,Malay,Chinese,Filipino": 0.03333333333333333, + "English,Indonesian,Spanish,Chinese": 0.04, + "English,Indonesian,Spanish,Filipino": 0.04, + "English,Indonesian,Chinese,Filipino": 0.06666666666666667, + "English,Spanish,Chinese,Filipino": 0.03333333333333333, + "Vietnamese,Malay,Indonesian,Spanish": 0.04, + "Vietnamese,Malay,Indonesian,Chinese": 0.06, + "Vietnamese,Malay,Indonesian,Filipino": 0.07333333333333333, + "Vietnamese,Malay,Spanish,Chinese": 0.013333333333333334, + "Vietnamese,Malay,Spanish,Filipino": 0.013333333333333334, + "Vietnamese,Malay,Chinese,Filipino": 0.05333333333333334, + "Vietnamese,Indonesian,Spanish,Chinese": 0.03333333333333333, + "Vietnamese,Indonesian,Spanish,Filipino": 0.04, + "Vietnamese,Indonesian,Chinese,Filipino": 0.08666666666666667, + "Vietnamese,Spanish,Chinese,Filipino": 0.02666666666666667, + "Malay,Indonesian,Spanish,Chinese": 0.006666666666666667, + "Malay,Indonesian,Spanish,Filipino": 0.03333333333333333, + "Malay,Indonesian,Chinese,Filipino": 0.07333333333333333, + "Malay,Spanish,Chinese,Filipino": 0.0, + "Indonesian,Spanish,Chinese,Filipino": 0.04 + }, + "5_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish": 0.013333333333333334, + "English,Vietnamese,Malay,Indonesian,Chinese": 0.02, + "English,Vietnamese,Malay,Indonesian,Filipino": 0.02666666666666667, + "English,Vietnamese,Malay,Spanish,Chinese": 0.0, + "English,Vietnamese,Malay,Spanish,Filipino": 0.006666666666666667, + "English,Vietnamese,Malay,Chinese,Filipino": 0.02, + "English,Vietnamese,Indonesian,Spanish,Chinese": 0.02666666666666667, + "English,Vietnamese,Indonesian,Spanish,Filipino": 0.02666666666666667, + "English,Vietnamese,Indonesian,Chinese,Filipino": 0.04, + "English,Vietnamese,Spanish,Chinese,Filipino": 0.02666666666666667, + "English,Malay,Indonesian,Spanish,Chinese": 0.006666666666666667, + "English,Malay,Indonesian,Spanish,Filipino": 0.013333333333333334, + "English,Malay,Indonesian,Chinese,Filipino": 0.02, + "English,Malay,Spanish,Chinese,Filipino": 0.0, + "English,Indonesian,Spanish,Chinese,Filipino": 0.02666666666666667, + "Vietnamese,Malay,Indonesian,Spanish,Chinese": 0.0, + "Vietnamese,Malay,Indonesian,Spanish,Filipino": 0.013333333333333334, + "Vietnamese,Malay,Indonesian,Chinese,Filipino": 0.04666666666666667, + "Vietnamese,Malay,Spanish,Chinese,Filipino": 0.0, + "Vietnamese,Indonesian,Spanish,Chinese,Filipino": 0.02, + "Malay,Indonesian,Spanish,Chinese,Filipino": 0.0 + }, + "6_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish,Chinese": 0.0, + "English,Vietnamese,Malay,Indonesian,Spanish,Filipino": 0.006666666666666667, + "English,Vietnamese,Malay,Indonesian,Chinese,Filipino": 0.013333333333333334, + "English,Vietnamese,Malay,Spanish,Chinese,Filipino": 0.0, + "English,Vietnamese,Indonesian,Spanish,Chinese,Filipino": 0.02, + "English,Malay,Indonesian,Spanish,Chinese,Filipino": 0.0, + "Vietnamese,Malay,Indonesian,Spanish,Chinese,Filipino": 0.0 + }, + "7_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish,Chinese,Filipino": 0.0 + } + }, + "AC3_2": 0.2729629629132041, + "AC3_3": 0.14643249951558787, + "AC3_4": 0.07058030927030631, + "AC3_5": 0.03156970605051767, + "AC3_6": 0.011178310736070876, + "AC3_7": 0.0 + }, + "prompt_3": { + "overall_acc": 0.24952380952380954, + "language_acc": { + "English": 0.24666666666666667, + "Vietnamese": 0.25333333333333335, + "Malay": 0.26666666666666666, + "Indonesian": 0.2866666666666667, + "Spanish": 0.2733333333333333, + "Chinese": 0.23333333333333334, + "Filipino": 0.18666666666666668 + }, + "consistency_score_2": 0.2565079365079364, + "consistency_score_3": 0.07409523809523808, + "consistency_score_4": 0.02876190476190476, + "consistency_score_5": 0.01492063492063492, + "consistency_score_6": 0.009523809523809523, + "consistency_score_7": 0.006666666666666667, + "detailed_consistency_score": { + "2_combine": { + "English,Vietnamese": 0.32, + "English,Malay": 0.20666666666666667, + "English,Indonesian": 0.21333333333333335, + "English,Spanish": 0.26666666666666666, + "English,Chinese": 0.24, + "English,Filipino": 0.22, + "Vietnamese,Malay": 0.24, + "Vietnamese,Indonesian": 0.28, + "Vietnamese,Spanish": 0.30666666666666664, + "Vietnamese,Chinese": 0.23333333333333334, + "Vietnamese,Filipino": 0.22666666666666666, + "Malay,Indonesian": 0.24, + "Malay,Spanish": 0.25333333333333335, + "Malay,Chinese": 0.24, + "Malay,Filipino": 0.3, + "Indonesian,Spanish": 0.25333333333333335, + "Indonesian,Chinese": 0.29333333333333333, + "Indonesian,Filipino": 0.29333333333333333, + "Spanish,Chinese": 0.2733333333333333, + "Spanish,Filipino": 0.21333333333333335, + "Chinese,Filipino": 0.2733333333333333 + }, + "3_combine": { + "English,Vietnamese,Malay": 0.06, + "English,Vietnamese,Indonesian": 0.07333333333333333, + "English,Vietnamese,Spanish": 0.10666666666666667, + "English,Vietnamese,Chinese": 0.08, + "English,Vietnamese,Filipino": 0.07333333333333333, + "English,Malay,Indonesian": 0.05333333333333334, + "English,Malay,Spanish": 0.04, + "English,Malay,Chinese": 0.06666666666666667, + "English,Malay,Filipino": 0.05333333333333334, + "English,Indonesian,Spanish": 0.07333333333333333, + "English,Indonesian,Chinese": 0.06666666666666667, + "English,Indonesian,Filipino": 0.04666666666666667, + "English,Spanish,Chinese": 0.06, + "English,Spanish,Filipino": 0.06, + "English,Chinese,Filipino": 0.06, + "Vietnamese,Malay,Indonesian": 0.08666666666666667, + "Vietnamese,Malay,Spanish": 0.05333333333333334, + "Vietnamese,Malay,Chinese": 0.04666666666666667, + "Vietnamese,Malay,Filipino": 0.08666666666666667, + "Vietnamese,Indonesian,Spanish": 0.08666666666666667, + "Vietnamese,Indonesian,Chinese": 0.08666666666666667, + "Vietnamese,Indonesian,Filipino": 0.07333333333333333, + "Vietnamese,Spanish,Chinese": 0.10666666666666667, + "Vietnamese,Spanish,Filipino": 0.06, + "Vietnamese,Chinese,Filipino": 0.07333333333333333, + "Malay,Indonesian,Spanish": 0.07333333333333333, + "Malay,Indonesian,Chinese": 0.06666666666666667, + "Malay,Indonesian,Filipino": 0.09333333333333334, + "Malay,Spanish,Chinese": 0.08, + "Malay,Spanish,Filipino": 0.08666666666666667, + "Malay,Chinese,Filipino": 0.10666666666666667, + "Indonesian,Spanish,Chinese": 0.08666666666666667, + "Indonesian,Spanish,Filipino": 0.08, + "Indonesian,Chinese,Filipino": 0.09333333333333334, + "Spanish,Chinese,Filipino": 0.09333333333333334 + }, + "4_combine": { + "English,Vietnamese,Malay,Indonesian": 0.03333333333333333, + "English,Vietnamese,Malay,Spanish": 0.02, + "English,Vietnamese,Malay,Chinese": 0.02, + "English,Vietnamese,Malay,Filipino": 0.02666666666666667, + "English,Vietnamese,Indonesian,Spanish": 0.02666666666666667, + "English,Vietnamese,Indonesian,Chinese": 0.02, + "English,Vietnamese,Indonesian,Filipino": 0.02, + "English,Vietnamese,Spanish,Chinese": 0.04666666666666667, + "English,Vietnamese,Spanish,Filipino": 0.02666666666666667, + "English,Vietnamese,Chinese,Filipino": 0.04, + "English,Malay,Indonesian,Spanish": 0.013333333333333334, + "English,Malay,Indonesian,Chinese": 0.02666666666666667, + "English,Malay,Indonesian,Filipino": 0.013333333333333334, + "English,Malay,Spanish,Chinese": 0.013333333333333334, + "English,Malay,Spanish,Filipino": 0.02, + "English,Malay,Chinese,Filipino": 0.03333333333333333, + "English,Indonesian,Spanish,Chinese": 0.02, + "English,Indonesian,Spanish,Filipino": 0.02, + "English,Indonesian,Chinese,Filipino": 0.02, + "English,Spanish,Chinese,Filipino": 0.02, + "Vietnamese,Malay,Indonesian,Spanish": 0.02666666666666667, + "Vietnamese,Malay,Indonesian,Chinese": 0.02, + "Vietnamese,Malay,Indonesian,Filipino": 0.04, + "Vietnamese,Malay,Spanish,Chinese": 0.03333333333333333, + "Vietnamese,Malay,Spanish,Filipino": 0.02666666666666667, + "Vietnamese,Malay,Chinese,Filipino": 0.03333333333333333, + "Vietnamese,Indonesian,Spanish,Chinese": 0.03333333333333333, + "Vietnamese,Indonesian,Spanish,Filipino": 0.03333333333333333, + "Vietnamese,Indonesian,Chinese,Filipino": 0.02666666666666667, + "Vietnamese,Spanish,Chinese,Filipino": 0.04, + "Malay,Indonesian,Spanish,Chinese": 0.04, + "Malay,Indonesian,Spanish,Filipino": 0.03333333333333333, + "Malay,Indonesian,Chinese,Filipino": 0.04666666666666667, + "Malay,Spanish,Chinese,Filipino": 0.05333333333333334, + "Indonesian,Spanish,Chinese,Filipino": 0.04 + }, + "5_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish": 0.013333333333333334, + "English,Vietnamese,Malay,Indonesian,Chinese": 0.006666666666666667, + "English,Vietnamese,Malay,Indonesian,Filipino": 0.006666666666666667, + "English,Vietnamese,Malay,Spanish,Chinese": 0.013333333333333334, + "English,Vietnamese,Malay,Spanish,Filipino": 0.013333333333333334, + "English,Vietnamese,Malay,Chinese,Filipino": 0.02, + "English,Vietnamese,Indonesian,Spanish,Chinese": 0.013333333333333334, + "English,Vietnamese,Indonesian,Spanish,Filipino": 0.013333333333333334, + "English,Vietnamese,Indonesian,Chinese,Filipino": 0.006666666666666667, + "English,Vietnamese,Spanish,Chinese,Filipino": 0.02, + "English,Malay,Indonesian,Spanish,Chinese": 0.006666666666666667, + "English,Malay,Indonesian,Spanish,Filipino": 0.006666666666666667, + "English,Malay,Indonesian,Chinese,Filipino": 0.013333333333333334, + "English,Malay,Spanish,Chinese,Filipino": 0.013333333333333334, + "English,Indonesian,Spanish,Chinese,Filipino": 0.006666666666666667, + "Vietnamese,Malay,Indonesian,Spanish,Chinese": 0.02, + "Vietnamese,Malay,Indonesian,Spanish,Filipino": 0.02, + "Vietnamese,Malay,Indonesian,Chinese,Filipino": 0.02, + "Vietnamese,Malay,Spanish,Chinese,Filipino": 0.02666666666666667, + "Vietnamese,Indonesian,Spanish,Chinese,Filipino": 0.02, + "Malay,Indonesian,Spanish,Chinese,Filipino": 0.03333333333333333 + }, + "6_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish,Chinese": 0.006666666666666667, + "English,Vietnamese,Malay,Indonesian,Spanish,Filipino": 0.006666666666666667, + "English,Vietnamese,Malay,Indonesian,Chinese,Filipino": 0.006666666666666667, + "English,Vietnamese,Malay,Spanish,Chinese,Filipino": 0.013333333333333334, + "English,Vietnamese,Indonesian,Spanish,Chinese,Filipino": 0.006666666666666667, + "English,Malay,Indonesian,Spanish,Chinese,Filipino": 0.006666666666666667, + "Vietnamese,Malay,Indonesian,Spanish,Chinese,Filipino": 0.02 + }, + "7_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish,Chinese,Filipino": 0.006666666666666667 + } + }, + "AC3_2": 0.252967676355578, + "AC3_3": 0.11426104988201102, + "AC3_4": 0.051578501334094286, + "AC3_5": 0.02815754872313141, + "AC3_6": 0.018347338928491615, + "AC3_7": 0.012986369263828122 + }, + "prompt_4": { + "overall_acc": 0.24380952380952378, + "language_acc": { + "English": 0.2866666666666667, + "Vietnamese": 0.26, + "Malay": 0.18666666666666668, + "Indonesian": 0.23333333333333334, + "Spanish": 0.26666666666666666, + "Chinese": 0.26666666666666666, + "Filipino": 0.20666666666666667 + }, + "consistency_score_2": 0.33523809523809533, + "consistency_score_3": 0.14457142857142857, + "consistency_score_4": 0.07752380952380955, + "consistency_score_5": 0.05047619047619048, + "consistency_score_6": 0.039047619047619046, + "consistency_score_7": 0.03333333333333333, + "detailed_consistency_score": { + "2_combine": { + "English,Vietnamese": 0.48, + "English,Malay": 0.28, + "English,Indonesian": 0.32, + "English,Spanish": 0.3333333333333333, + "English,Chinese": 0.4066666666666667, + "English,Filipino": 0.37333333333333335, + "Vietnamese,Malay": 0.3333333333333333, + "Vietnamese,Indonesian": 0.30666666666666664, + "Vietnamese,Spanish": 0.34, + "Vietnamese,Chinese": 0.38, + "Vietnamese,Filipino": 0.34, + "Malay,Indonesian": 0.31333333333333335, + "Malay,Spanish": 0.35333333333333333, + "Malay,Chinese": 0.32666666666666666, + "Malay,Filipino": 0.28, + "Indonesian,Spanish": 0.30666666666666664, + "Indonesian,Chinese": 0.36, + "Indonesian,Filipino": 0.28, + "Spanish,Chinese": 0.32666666666666666, + "Spanish,Filipino": 0.28, + "Chinese,Filipino": 0.32 + }, + "3_combine": { + "English,Vietnamese,Malay": 0.17333333333333334, + "English,Vietnamese,Indonesian": 0.16666666666666666, + "English,Vietnamese,Spanish": 0.20666666666666667, + "English,Vietnamese,Chinese": 0.24666666666666667, + "English,Vietnamese,Filipino": 0.2, + "English,Malay,Indonesian": 0.12, + "English,Malay,Spanish": 0.13333333333333333, + "English,Malay,Chinese": 0.15333333333333332, + "English,Malay,Filipino": 0.12, + "English,Indonesian,Spanish": 0.12, + "English,Indonesian,Chinese": 0.17333333333333334, + "English,Indonesian,Filipino": 0.13333333333333333, + "English,Spanish,Chinese": 0.17333333333333334, + "English,Spanish,Filipino": 0.13333333333333333, + "English,Chinese,Filipino": 0.18666666666666668, + "Vietnamese,Malay,Indonesian": 0.14, + "Vietnamese,Malay,Spanish": 0.14, + "Vietnamese,Malay,Chinese": 0.13333333333333333, + "Vietnamese,Malay,Filipino": 0.12666666666666668, + "Vietnamese,Indonesian,Spanish": 0.12666666666666668, + "Vietnamese,Indonesian,Chinese": 0.14666666666666667, + "Vietnamese,Indonesian,Filipino": 0.11333333333333333, + "Vietnamese,Spanish,Chinese": 0.14666666666666667, + "Vietnamese,Spanish,Filipino": 0.13333333333333333, + "Vietnamese,Chinese,Filipino": 0.18, + "Malay,Indonesian,Spanish": 0.13333333333333333, + "Malay,Indonesian,Chinese": 0.14, + "Malay,Indonesian,Filipino": 0.08666666666666667, + "Malay,Spanish,Chinese": 0.13333333333333333, + "Malay,Spanish,Filipino": 0.12, + "Malay,Chinese,Filipino": 0.13333333333333333, + "Indonesian,Spanish,Chinese": 0.14, + "Indonesian,Spanish,Filipino": 0.10666666666666667, + "Indonesian,Chinese,Filipino": 0.12666666666666668, + "Spanish,Chinese,Filipino": 0.11333333333333333 + }, + "4_combine": { + "English,Vietnamese,Malay,Indonesian": 0.08666666666666667, + "English,Vietnamese,Malay,Spanish": 0.08, + "English,Vietnamese,Malay,Chinese": 0.10666666666666667, + "English,Vietnamese,Malay,Filipino": 0.08666666666666667, + "English,Vietnamese,Indonesian,Spanish": 0.08666666666666667, + "English,Vietnamese,Indonesian,Chinese": 0.11333333333333333, + "English,Vietnamese,Indonesian,Filipino": 0.06666666666666667, + "English,Vietnamese,Spanish,Chinese": 0.12666666666666668, + "English,Vietnamese,Spanish,Filipino": 0.09333333333333334, + "English,Vietnamese,Chinese,Filipino": 0.12666666666666668, + "English,Malay,Indonesian,Spanish": 0.07333333333333333, + "English,Malay,Indonesian,Chinese": 0.08666666666666667, + "English,Malay,Indonesian,Filipino": 0.06666666666666667, + "English,Malay,Spanish,Chinese": 0.08, + "English,Malay,Spanish,Filipino": 0.06, + "English,Malay,Chinese,Filipino": 0.08666666666666667, + "English,Indonesian,Spanish,Chinese": 0.08, + "English,Indonesian,Spanish,Filipino": 0.06, + "English,Indonesian,Chinese,Filipino": 0.09333333333333334, + "English,Spanish,Chinese,Filipino": 0.06666666666666667, + "Vietnamese,Malay,Indonesian,Spanish": 0.08, + "Vietnamese,Malay,Indonesian,Chinese": 0.07333333333333333, + "Vietnamese,Malay,Indonesian,Filipino": 0.06, + "Vietnamese,Malay,Spanish,Chinese": 0.06666666666666667, + "Vietnamese,Malay,Spanish,Filipino": 0.06, + "Vietnamese,Malay,Chinese,Filipino": 0.08, + "Vietnamese,Indonesian,Spanish,Chinese": 0.07333333333333333, + "Vietnamese,Indonesian,Spanish,Filipino": 0.06, + "Vietnamese,Indonesian,Chinese,Filipino": 0.07333333333333333, + "Vietnamese,Spanish,Chinese,Filipino": 0.06666666666666667, + "Malay,Indonesian,Spanish,Chinese": 0.06, + "Malay,Indonesian,Spanish,Filipino": 0.06, + "Malay,Indonesian,Chinese,Filipino": 0.06, + "Malay,Spanish,Chinese,Filipino": 0.06, + "Indonesian,Spanish,Chinese,Filipino": 0.05333333333333334 + }, + "5_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish": 0.05333333333333334, + "English,Vietnamese,Malay,Indonesian,Chinese": 0.06666666666666667, + "English,Vietnamese,Malay,Indonesian,Filipino": 0.05333333333333334, + "English,Vietnamese,Malay,Spanish,Chinese": 0.06, + "English,Vietnamese,Malay,Spanish,Filipino": 0.04666666666666667, + "English,Vietnamese,Malay,Chinese,Filipino": 0.07333333333333333, + "English,Vietnamese,Indonesian,Spanish,Chinese": 0.06666666666666667, + "English,Vietnamese,Indonesian,Spanish,Filipino": 0.04, + "English,Vietnamese,Indonesian,Chinese,Filipino": 0.06, + "English,Vietnamese,Spanish,Chinese,Filipino": 0.05333333333333334, + "English,Malay,Indonesian,Spanish,Chinese": 0.05333333333333334, + "English,Malay,Indonesian,Spanish,Filipino": 0.04666666666666667, + "English,Malay,Indonesian,Chinese,Filipino": 0.05333333333333334, + "English,Malay,Spanish,Chinese,Filipino": 0.04, + "English,Indonesian,Spanish,Chinese,Filipino": 0.04, + "Vietnamese,Malay,Indonesian,Spanish,Chinese": 0.04666666666666667, + "Vietnamese,Malay,Indonesian,Spanish,Filipino": 0.04666666666666667, + "Vietnamese,Malay,Indonesian,Chinese,Filipino": 0.04666666666666667, + "Vietnamese,Malay,Spanish,Chinese,Filipino": 0.04, + "Vietnamese,Indonesian,Spanish,Chinese,Filipino": 0.04, + "Malay,Indonesian,Spanish,Chinese,Filipino": 0.03333333333333333 + }, + "6_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish,Chinese": 0.04666666666666667, + "English,Vietnamese,Malay,Indonesian,Spanish,Filipino": 0.04, + "English,Vietnamese,Malay,Indonesian,Chinese,Filipino": 0.04666666666666667, + "English,Vietnamese,Malay,Spanish,Chinese,Filipino": 0.04, + "English,Vietnamese,Indonesian,Spanish,Chinese,Filipino": 0.03333333333333333, + "English,Malay,Indonesian,Spanish,Chinese,Filipino": 0.03333333333333333, + "Vietnamese,Malay,Indonesian,Spanish,Chinese,Filipino": 0.03333333333333333 + }, + "7_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish,Chinese,Filipino": 0.03333333333333333 + } + }, + "AC3_2": 0.2823057643622741, + "AC3_3": 0.18151194558487657, + "AC3_4": 0.11764134695861926, + "AC3_5": 0.08363692399685323, + "AC3_6": 0.06731441395728262, + "AC3_7": 0.05864833903954842 + }, + "prompt_5": { + "overall_acc": 0.25999999999999995, + "language_acc": { + "English": 0.22666666666666666, + "Vietnamese": 0.29333333333333333, + "Malay": 0.26666666666666666, + "Indonesian": 0.22, + "Spanish": 0.2866666666666667, + "Chinese": 0.24666666666666667, + "Filipino": 0.28 + }, + "consistency_score_2": 0.3022222222222222, + "consistency_score_3": 0.10114285714285716, + "consistency_score_4": 0.035809523809523805, + "consistency_score_5": 0.013015873015873012, + "consistency_score_6": 0.00380952380952381, + "consistency_score_7": 0.0, + "detailed_consistency_score": { + "2_combine": { + "English,Vietnamese": 0.22666666666666666, + "English,Malay": 0.3, + "English,Indonesian": 0.29333333333333333, + "English,Spanish": 0.35333333333333333, + "English,Chinese": 0.31333333333333335, + "English,Filipino": 0.31333333333333335, + "Vietnamese,Malay": 0.21333333333333335, + "Vietnamese,Indonesian": 0.23333333333333334, + "Vietnamese,Spanish": 0.4266666666666667, + "Vietnamese,Chinese": 0.37333333333333335, + "Vietnamese,Filipino": 0.42, + "Malay,Indonesian": 0.26, + "Malay,Spanish": 0.26666666666666666, + "Malay,Chinese": 0.23333333333333334, + "Malay,Filipino": 0.22, + "Indonesian,Spanish": 0.22666666666666666, + "Indonesian,Chinese": 0.23333333333333334, + "Indonesian,Filipino": 0.26666666666666666, + "Spanish,Chinese": 0.41333333333333333, + "Spanish,Filipino": 0.41333333333333333, + "Chinese,Filipino": 0.3466666666666667 + }, + "3_combine": { + "English,Vietnamese,Malay": 0.06666666666666667, + "English,Vietnamese,Indonesian": 0.04666666666666667, + "English,Vietnamese,Spanish": 0.1, + "English,Vietnamese,Chinese": 0.08666666666666667, + "English,Vietnamese,Filipino": 0.10666666666666667, + "English,Malay,Indonesian": 0.09333333333333334, + "English,Malay,Spanish": 0.11333333333333333, + "English,Malay,Chinese": 0.1, + "English,Malay,Filipino": 0.09333333333333334, + "English,Indonesian,Spanish": 0.08666666666666667, + "English,Indonesian,Chinese": 0.08, + "English,Indonesian,Filipino": 0.08666666666666667, + "English,Spanish,Chinese": 0.17333333333333334, + "English,Spanish,Filipino": 0.15333333333333332, + "English,Chinese,Filipino": 0.08666666666666667, + "Vietnamese,Malay,Indonesian": 0.06, + "Vietnamese,Malay,Spanish": 0.1, + "Vietnamese,Malay,Chinese": 0.09333333333333334, + "Vietnamese,Malay,Filipino": 0.06, + "Vietnamese,Indonesian,Spanish": 0.08666666666666667, + "Vietnamese,Indonesian,Chinese": 0.09333333333333334, + "Vietnamese,Indonesian,Filipino": 0.07333333333333333, + "Vietnamese,Spanish,Chinese": 0.2, + "Vietnamese,Spanish,Filipino": 0.22666666666666666, + "Vietnamese,Chinese,Filipino": 0.18666666666666668, + "Malay,Indonesian,Spanish": 0.07333333333333333, + "Malay,Indonesian,Chinese": 0.06, + "Malay,Indonesian,Filipino": 0.06666666666666667, + "Malay,Spanish,Chinese": 0.09333333333333334, + "Malay,Spanish,Filipino": 0.1, + "Malay,Chinese,Filipino": 0.06, + "Indonesian,Spanish,Chinese": 0.08666666666666667, + "Indonesian,Spanish,Filipino": 0.08666666666666667, + "Indonesian,Chinese,Filipino": 0.08, + "Spanish,Chinese,Filipino": 0.18 + }, + "4_combine": { + "English,Vietnamese,Malay,Indonesian": 0.02, + "English,Vietnamese,Malay,Spanish": 0.04, + "English,Vietnamese,Malay,Chinese": 0.03333333333333333, + "English,Vietnamese,Malay,Filipino": 0.02, + "English,Vietnamese,Indonesian,Spanish": 0.02666666666666667, + "English,Vietnamese,Indonesian,Chinese": 0.02666666666666667, + "English,Vietnamese,Indonesian,Filipino": 0.006666666666666667, + "English,Vietnamese,Spanish,Chinese": 0.05333333333333334, + "English,Vietnamese,Spanish,Filipino": 0.05333333333333334, + "English,Vietnamese,Chinese,Filipino": 0.03333333333333333, + "English,Malay,Indonesian,Spanish": 0.03333333333333333, + "English,Malay,Indonesian,Chinese": 0.03333333333333333, + "English,Malay,Indonesian,Filipino": 0.03333333333333333, + "English,Malay,Spanish,Chinese": 0.04666666666666667, + "English,Malay,Spanish,Filipino": 0.04666666666666667, + "English,Malay,Chinese,Filipino": 0.02, + "English,Indonesian,Spanish,Chinese": 0.03333333333333333, + "English,Indonesian,Spanish,Filipino": 0.02666666666666667, + "English,Indonesian,Chinese,Filipino": 0.013333333333333334, + "English,Spanish,Chinese,Filipino": 0.06, + "Vietnamese,Malay,Indonesian,Spanish": 0.03333333333333333, + "Vietnamese,Malay,Indonesian,Chinese": 0.03333333333333333, + "Vietnamese,Malay,Indonesian,Filipino": 0.006666666666666667, + "Vietnamese,Malay,Spanish,Chinese": 0.05333333333333334, + "Vietnamese,Malay,Spanish,Filipino": 0.02666666666666667, + "Vietnamese,Malay,Chinese,Filipino": 0.03333333333333333, + "Vietnamese,Indonesian,Spanish,Chinese": 0.05333333333333334, + "Vietnamese,Indonesian,Spanish,Filipino": 0.04, + "Vietnamese,Indonesian,Chinese,Filipino": 0.04666666666666667, + "Vietnamese,Spanish,Chinese,Filipino": 0.12, + "Malay,Indonesian,Spanish,Chinese": 0.03333333333333333, + "Malay,Indonesian,Spanish,Filipino": 0.02666666666666667, + "Malay,Indonesian,Chinese,Filipino": 0.02, + "Malay,Spanish,Chinese,Filipino": 0.02666666666666667, + "Indonesian,Spanish,Chinese,Filipino": 0.04 + }, + "5_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish": 0.02, + "English,Vietnamese,Malay,Indonesian,Chinese": 0.02, + "English,Vietnamese,Malay,Indonesian,Filipino": 0.0, + "English,Vietnamese,Malay,Spanish,Chinese": 0.02666666666666667, + "English,Vietnamese,Malay,Spanish,Filipino": 0.006666666666666667, + "English,Vietnamese,Malay,Chinese,Filipino": 0.006666666666666667, + "English,Vietnamese,Indonesian,Spanish,Chinese": 0.02, + "English,Vietnamese,Indonesian,Spanish,Filipino": 0.0, + "English,Vietnamese,Indonesian,Chinese,Filipino": 0.0, + "English,Vietnamese,Spanish,Chinese,Filipino": 0.02666666666666667, + "English,Malay,Indonesian,Spanish,Chinese": 0.02, + "English,Malay,Indonesian,Spanish,Filipino": 0.013333333333333334, + "English,Malay,Indonesian,Chinese,Filipino": 0.006666666666666667, + "English,Malay,Spanish,Chinese,Filipino": 0.006666666666666667, + "English,Indonesian,Spanish,Chinese,Filipino": 0.0, + "Vietnamese,Malay,Indonesian,Spanish,Chinese": 0.02666666666666667, + "Vietnamese,Malay,Indonesian,Spanish,Filipino": 0.006666666666666667, + "Vietnamese,Malay,Indonesian,Chinese,Filipino": 0.006666666666666667, + "Vietnamese,Malay,Spanish,Chinese,Filipino": 0.013333333333333334, + "Vietnamese,Indonesian,Spanish,Chinese,Filipino": 0.03333333333333333, + "Malay,Indonesian,Spanish,Chinese,Filipino": 0.013333333333333334 + }, + "6_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish,Chinese": 0.02, + "English,Vietnamese,Malay,Indonesian,Spanish,Filipino": 0.0, + "English,Vietnamese,Malay,Indonesian,Chinese,Filipino": 0.0, + "English,Vietnamese,Malay,Spanish,Chinese,Filipino": 0.0, + "English,Vietnamese,Indonesian,Spanish,Chinese,Filipino": 0.0, + "English,Malay,Indonesian,Spanish,Chinese,Filipino": 0.0, + "Vietnamese,Malay,Indonesian,Spanish,Chinese,Filipino": 0.006666666666666667 + }, + "7_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish,Chinese,Filipino": 0.0 + } + }, + "AC3_2": 0.27952569164988667, + "AC3_3": 0.1456329113520795, + "AC3_4": 0.06294913069346536, + "AC3_5": 0.02479069766533828, + "AC3_6": 0.007509025267911742, + "AC3_7": 0.0 + } + }, + "cross_logiqa": { + "prompt_1": { + "overall_acc": 0.2564935064935065, + "language_acc": { + "Vietnamese": 0.2840909090909091, + "Indonesian": 0.26704545454545453, + "Malay": 0.24431818181818182, + "English": 0.2215909090909091, + "Spanish": 0.25, + "Filipino": 0.26136363636363635, + "Chinese": 0.26704545454545453 + }, + "consistency_score_2": 0.38041125541125537, + "consistency_score_3": 0.17905844155844156, + "consistency_score_4": 0.09740259740259744, + "consistency_score_5": 0.05979437229437228, + "consistency_score_6": 0.041396103896103896, + "consistency_score_7": 0.03409090909090909, + "detailed_consistency_score": { + "2_combine": { + "Vietnamese,Indonesian": 0.3977272727272727, + "Vietnamese,Malay": 0.44886363636363635, + "Vietnamese,English": 0.3806818181818182, + "Vietnamese,Spanish": 0.3352272727272727, + "Vietnamese,Filipino": 0.39204545454545453, + "Vietnamese,Chinese": 0.36363636363636365, + "Indonesian,Malay": 0.4943181818181818, + "Indonesian,English": 0.42045454545454547, + "Indonesian,Spanish": 0.35795454545454547, + "Indonesian,Filipino": 0.3977272727272727, + "Indonesian,Chinese": 0.375, + "Malay,English": 0.4375, + "Malay,Spanish": 0.4034090909090909, + "Malay,Filipino": 0.42613636363636365, + "Malay,Chinese": 0.3465909090909091, + "English,Spanish": 0.3125, + "English,Filipino": 0.3181818181818182, + "English,Chinese": 0.4431818181818182, + "Spanish,Filipino": 0.36363636363636365, + "Spanish,Chinese": 0.30113636363636365, + "Filipino,Chinese": 0.2727272727272727 + }, + "3_combine": { + "Vietnamese,Indonesian,Malay": 0.25, + "Vietnamese,Indonesian,English": 0.17613636363636365, + "Vietnamese,Indonesian,Spanish": 0.14772727272727273, + "Vietnamese,Indonesian,Filipino": 0.20454545454545456, + "Vietnamese,Indonesian,Chinese": 0.17613636363636365, + "Vietnamese,Malay,English": 0.22727272727272727, + "Vietnamese,Malay,Spanish": 0.19886363636363635, + "Vietnamese,Malay,Filipino": 0.24431818181818182, + "Vietnamese,Malay,Chinese": 0.17045454545454544, + "Vietnamese,English,Spanish": 0.14204545454545456, + "Vietnamese,English,Filipino": 0.14204545454545456, + "Vietnamese,English,Chinese": 0.19318181818181818, + "Vietnamese,Spanish,Filipino": 0.19318181818181818, + "Vietnamese,Spanish,Chinese": 0.10227272727272728, + "Vietnamese,Filipino,Chinese": 0.13636363636363635, + "Indonesian,Malay,English": 0.26136363636363635, + "Indonesian,Malay,Spanish": 0.2159090909090909, + "Indonesian,Malay,Filipino": 0.24431818181818182, + "Indonesian,Malay,Chinese": 0.19318181818181818, + "Indonesian,English,Spanish": 0.17045454545454544, + "Indonesian,English,Filipino": 0.17613636363636365, + "Indonesian,English,Chinese": 0.2159090909090909, + "Indonesian,Spanish,Filipino": 0.17045454545454544, + "Indonesian,Spanish,Chinese": 0.125, + "Indonesian,Filipino,Chinese": 0.14204545454545456, + "Malay,English,Spanish": 0.1875, + "Malay,English,Filipino": 0.19886363636363635, + "Malay,English,Chinese": 0.19886363636363635, + "Malay,Spanish,Filipino": 0.1875, + "Malay,Spanish,Chinese": 0.14772727272727273, + "Malay,Filipino,Chinese": 0.1590909090909091, + "English,Spanish,Filipino": 0.14204545454545456, + "English,Spanish,Chinese": 0.1590909090909091, + "English,Filipino,Chinese": 0.14772727272727273, + "Spanish,Filipino,Chinese": 0.11931818181818182 + }, + "4_combine": { + "Vietnamese,Indonesian,Malay,English": 0.13636363636363635, + "Vietnamese,Indonesian,Malay,Spanish": 0.10795454545454546, + "Vietnamese,Indonesian,Malay,Filipino": 0.16477272727272727, + "Vietnamese,Indonesian,Malay,Chinese": 0.09659090909090909, + "Vietnamese,Indonesian,English,Spanish": 0.07386363636363637, + "Vietnamese,Indonesian,English,Filipino": 0.09090909090909091, + "Vietnamese,Indonesian,English,Chinese": 0.10227272727272728, + "Vietnamese,Indonesian,Spanish,Filipino": 0.10795454545454546, + "Vietnamese,Indonesian,Spanish,Chinese": 0.03977272727272727, + "Vietnamese,Indonesian,Filipino,Chinese": 0.07954545454545454, + "Vietnamese,Malay,English,Spanish": 0.10795454545454546, + "Vietnamese,Malay,English,Filipino": 0.125, + "Vietnamese,Malay,English,Chinese": 0.11931818181818182, + "Vietnamese,Malay,Spanish,Filipino": 0.13636363636363635, + "Vietnamese,Malay,Spanish,Chinese": 0.07386363636363637, + "Vietnamese,Malay,Filipino,Chinese": 0.09090909090909091, + "Vietnamese,English,Spanish,Filipino": 0.07954545454545454, + "Vietnamese,English,Spanish,Chinese": 0.056818181818181816, + "Vietnamese,English,Filipino,Chinese": 0.07954545454545454, + "Vietnamese,Spanish,Filipino,Chinese": 0.056818181818181816, + "Indonesian,Malay,English,Spanish": 0.11931818181818182, + "Indonesian,Malay,English,Filipino": 0.125, + "Indonesian,Malay,English,Chinese": 0.125, + "Indonesian,Malay,Spanish,Filipino": 0.11363636363636363, + "Indonesian,Malay,Spanish,Chinese": 0.08522727272727272, + "Indonesian,Malay,Filipino,Chinese": 0.09090909090909091, + "Indonesian,English,Spanish,Filipino": 0.08522727272727272, + "Indonesian,English,Spanish,Chinese": 0.09659090909090909, + "Indonesian,English,Filipino,Chinese": 0.09090909090909091, + "Indonesian,Spanish,Filipino,Chinese": 0.06818181818181818, + "Malay,English,Spanish,Filipino": 0.10227272727272728, + "Malay,English,Spanish,Chinese": 0.09659090909090909, + "Malay,English,Filipino,Chinese": 0.11363636363636363, + "Malay,Spanish,Filipino,Chinese": 0.07954545454545454, + "English,Spanish,Filipino,Chinese": 0.09090909090909091 + }, + "5_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish": 0.06818181818181818, + "Vietnamese,Indonesian,Malay,English,Filipino": 0.08522727272727272, + "Vietnamese,Indonesian,Malay,English,Chinese": 0.07386363636363637, + "Vietnamese,Indonesian,Malay,Spanish,Filipino": 0.09090909090909091, + "Vietnamese,Indonesian,Malay,Spanish,Chinese": 0.03409090909090909, + "Vietnamese,Indonesian,Malay,Filipino,Chinese": 0.056818181818181816, + "Vietnamese,Indonesian,English,Spanish,Filipino": 0.05113636363636364, + "Vietnamese,Indonesian,English,Spanish,Chinese": 0.03409090909090909, + "Vietnamese,Indonesian,English,Filipino,Chinese": 0.05113636363636364, + "Vietnamese,Indonesian,Spanish,Filipino,Chinese": 0.03409090909090909, + "Vietnamese,Malay,English,Spanish,Filipino": 0.07386363636363637, + "Vietnamese,Malay,English,Spanish,Chinese": 0.05113636363636364, + "Vietnamese,Malay,English,Filipino,Chinese": 0.07386363636363637, + "Vietnamese,Malay,Spanish,Filipino,Chinese": 0.05113636363636364, + "Vietnamese,English,Spanish,Filipino,Chinese": 0.045454545454545456, + "Indonesian,Malay,English,Spanish,Filipino": 0.0625, + "Indonesian,Malay,English,Spanish,Chinese": 0.06818181818181818, + "Indonesian,Malay,English,Filipino,Chinese": 0.06818181818181818, + "Indonesian,Malay,Spanish,Filipino,Chinese": 0.05113636363636364, + "Indonesian,English,Spanish,Filipino,Chinese": 0.0625, + "Malay,English,Spanish,Filipino,Chinese": 0.06818181818181818 + }, + "6_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish,Filipino": 0.05113636363636364, + "Vietnamese,Indonesian,Malay,English,Spanish,Chinese": 0.03409090909090909, + "Vietnamese,Indonesian,Malay,English,Filipino,Chinese": 0.045454545454545456, + "Vietnamese,Indonesian,Malay,Spanish,Filipino,Chinese": 0.03409090909090909, + "Vietnamese,Indonesian,English,Spanish,Filipino,Chinese": 0.03409090909090909, + "Vietnamese,Malay,English,Spanish,Filipino,Chinese": 0.045454545454545456, + "Indonesian,Malay,English,Spanish,Filipino,Chinese": 0.045454545454545456 + }, + "7_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish,Filipino,Chinese": 0.03409090909090909 + } + }, + "AC3_2": 0.3063975106824536, + "AC3_3": 0.21089253641251043, + "AC3_4": 0.14118908610331898, + "AC3_5": 0.09698043612855935, + "AC3_6": 0.07128702357916922, + "AC3_7": 0.06018283390513971 + }, + "prompt_2": { + "overall_acc": 0.2524350649350649, + "language_acc": { + "Vietnamese": 0.2215909090909091, + "Indonesian": 0.23863636363636365, + "Malay": 0.2556818181818182, + "English": 0.2727272727272727, + "Spanish": 0.2727272727272727, + "Filipino": 0.2556818181818182, + "Chinese": 0.25 + }, + "consistency_score_2": 0.3728354978354978, + "consistency_score_3": 0.17581168831168834, + "consistency_score_4": 0.09123376623376624, + "consistency_score_5": 0.04734848484848484, + "consistency_score_6": 0.021915584415584416, + "consistency_score_7": 0.005681818181818182, + "detailed_consistency_score": { + "2_combine": { + "Vietnamese,Indonesian": 0.38636363636363635, + "Vietnamese,Malay": 0.5, + "Vietnamese,English": 0.42045454545454547, + "Vietnamese,Spanish": 0.25, + "Vietnamese,Filipino": 0.3522727272727273, + "Vietnamese,Chinese": 0.3409090909090909, + "Indonesian,Malay": 0.42613636363636365, + "Indonesian,English": 0.4318181818181818, + "Indonesian,Spanish": 0.32954545454545453, + "Indonesian,Filipino": 0.3181818181818182, + "Indonesian,Chinese": 0.36363636363636365, + "Malay,English": 0.4772727272727273, + "Malay,Spanish": 0.3181818181818182, + "Malay,Filipino": 0.42613636363636365, + "Malay,Chinese": 0.3693181818181818, + "English,Spanish": 0.32954545454545453, + "English,Filipino": 0.3977272727272727, + "English,Chinese": 0.42045454545454547, + "Spanish,Filipino": 0.3465909090909091, + "Spanish,Chinese": 0.3125, + "Filipino,Chinese": 0.3125 + }, + "3_combine": { + "Vietnamese,Indonesian,Malay": 0.26136363636363635, + "Vietnamese,Indonesian,English": 0.21022727272727273, + "Vietnamese,Indonesian,Spanish": 0.11931818181818182, + "Vietnamese,Indonesian,Filipino": 0.17045454545454544, + "Vietnamese,Indonesian,Chinese": 0.1875, + "Vietnamese,Malay,English": 0.3125, + "Vietnamese,Malay,Spanish": 0.16477272727272727, + "Vietnamese,Malay,Filipino": 0.24431818181818182, + "Vietnamese,Malay,Chinese": 0.20454545454545456, + "Vietnamese,English,Spanish": 0.13068181818181818, + "Vietnamese,English,Filipino": 0.1875, + "Vietnamese,English,Chinese": 0.19318181818181818, + "Vietnamese,Spanish,Filipino": 0.11363636363636363, + "Vietnamese,Spanish,Chinese": 0.10227272727272728, + "Vietnamese,Filipino,Chinese": 0.13068181818181818, + "Indonesian,Malay,English": 0.2727272727272727, + "Indonesian,Malay,Spanish": 0.17045454545454544, + "Indonesian,Malay,Filipino": 0.21022727272727273, + "Indonesian,Malay,Chinese": 0.2159090909090909, + "Indonesian,English,Spanish": 0.14772727272727273, + "Indonesian,English,Filipino": 0.14772727272727273, + "Indonesian,English,Chinese": 0.2159090909090909, + "Indonesian,Spanish,Filipino": 0.11931818181818182, + "Indonesian,Spanish,Chinese": 0.14772727272727273, + "Indonesian,Filipino,Chinese": 0.13068181818181818, + "Malay,English,Spanish": 0.16477272727272727, + "Malay,English,Filipino": 0.24431818181818182, + "Malay,English,Chinese": 0.22727272727272727, + "Malay,Spanish,Filipino": 0.1590909090909091, + "Malay,Spanish,Chinese": 0.13068181818181818, + "Malay,Filipino,Chinese": 0.1590909090909091, + "English,Spanish,Filipino": 0.14204545454545456, + "English,Spanish,Chinese": 0.13636363636363635, + "English,Filipino,Chinese": 0.1534090909090909, + "Spanish,Filipino,Chinese": 0.125 + }, + "4_combine": { + "Vietnamese,Indonesian,Malay,English": 0.19318181818181818, + "Vietnamese,Indonesian,Malay,Spanish": 0.09090909090909091, + "Vietnamese,Indonesian,Malay,Filipino": 0.13068181818181818, + "Vietnamese,Indonesian,Malay,Chinese": 0.13636363636363635, + "Vietnamese,Indonesian,English,Spanish": 0.0625, + "Vietnamese,Indonesian,English,Filipino": 0.10227272727272728, + "Vietnamese,Indonesian,English,Chinese": 0.11931818181818182, + "Vietnamese,Indonesian,Spanish,Filipino": 0.0625, + "Vietnamese,Indonesian,Spanish,Chinese": 0.0625, + "Vietnamese,Indonesian,Filipino,Chinese": 0.07954545454545454, + "Vietnamese,Malay,English,Spanish": 0.10227272727272728, + "Vietnamese,Malay,English,Filipino": 0.1534090909090909, + "Vietnamese,Malay,English,Chinese": 0.14772727272727273, + "Vietnamese,Malay,Spanish,Filipino": 0.09090909090909091, + "Vietnamese,Malay,Spanish,Chinese": 0.0625, + "Vietnamese,Malay,Filipino,Chinese": 0.09659090909090909, + "Vietnamese,English,Spanish,Filipino": 0.0625, + "Vietnamese,English,Spanish,Chinese": 0.0625, + "Vietnamese,English,Filipino,Chinese": 0.08522727272727272, + "Vietnamese,Spanish,Filipino,Chinese": 0.05113636363636364, + "Indonesian,Malay,English,Spanish": 0.09659090909090909, + "Indonesian,Malay,English,Filipino": 0.13068181818181818, + "Indonesian,Malay,English,Chinese": 0.1590909090909091, + "Indonesian,Malay,Spanish,Filipino": 0.06818181818181818, + "Indonesian,Malay,Spanish,Chinese": 0.08522727272727272, + "Indonesian,Malay,Filipino,Chinese": 0.09659090909090909, + "Indonesian,English,Spanish,Filipino": 0.045454545454545456, + "Indonesian,English,Spanish,Chinese": 0.06818181818181818, + "Indonesian,English,Filipino,Chinese": 0.07386363636363637, + "Indonesian,Spanish,Filipino,Chinese": 0.056818181818181816, + "Malay,English,Spanish,Filipino": 0.07954545454545454, + "Malay,English,Spanish,Chinese": 0.07386363636363637, + "Malay,English,Filipino,Chinese": 0.09659090909090909, + "Malay,Spanish,Filipino,Chinese": 0.0625, + "English,Spanish,Filipino,Chinese": 0.045454545454545456 + }, + "5_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish": 0.056818181818181816, + "Vietnamese,Indonesian,Malay,English,Filipino": 0.09659090909090909, + "Vietnamese,Indonesian,Malay,English,Chinese": 0.11363636363636363, + "Vietnamese,Indonesian,Malay,Spanish,Filipino": 0.045454545454545456, + "Vietnamese,Indonesian,Malay,Spanish,Chinese": 0.03409090909090909, + "Vietnamese,Indonesian,Malay,Filipino,Chinese": 0.0625, + "Vietnamese,Indonesian,English,Spanish,Filipino": 0.03409090909090909, + "Vietnamese,Indonesian,English,Spanish,Chinese": 0.028409090909090908, + "Vietnamese,Indonesian,English,Filipino,Chinese": 0.056818181818181816, + "Vietnamese,Indonesian,Spanish,Filipino,Chinese": 0.028409090909090908, + "Vietnamese,Malay,English,Spanish,Filipino": 0.05113636363636364, + "Vietnamese,Malay,English,Spanish,Chinese": 0.045454545454545456, + "Vietnamese,Malay,English,Filipino,Chinese": 0.06818181818181818, + "Vietnamese,Malay,Spanish,Filipino,Chinese": 0.03409090909090909, + "Vietnamese,English,Spanish,Filipino,Chinese": 0.028409090909090908, + "Indonesian,Malay,English,Spanish,Filipino": 0.03409090909090909, + "Indonesian,Malay,English,Spanish,Chinese": 0.045454545454545456, + "Indonesian,Malay,English,Filipino,Chinese": 0.0625, + "Indonesian,Malay,Spanish,Filipino,Chinese": 0.028409090909090908, + "Indonesian,English,Spanish,Filipino,Chinese": 0.011363636363636364, + "Malay,English,Spanish,Filipino,Chinese": 0.028409090909090908 + }, + "6_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish,Filipino": 0.028409090909090908, + "Vietnamese,Indonesian,Malay,English,Spanish,Chinese": 0.022727272727272728, + "Vietnamese,Indonesian,Malay,English,Filipino,Chinese": 0.05113636363636364, + "Vietnamese,Indonesian,Malay,Spanish,Filipino,Chinese": 0.011363636363636364, + "Vietnamese,Indonesian,English,Spanish,Filipino,Chinese": 0.011363636363636364, + "Vietnamese,Malay,English,Spanish,Filipino,Chinese": 0.022727272727272728, + "Indonesian,Malay,English,Spanish,Filipino,Chinese": 0.005681818181818182 + }, + "7_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish,Filipino,Chinese": 0.005681818181818182 + } + }, + "AC3_2": 0.301043288121927, + "AC3_3": 0.20726851801428098, + "AC3_4": 0.1340278757224619, + "AC3_5": 0.0797403183193147, + "AC3_6": 0.04032986243054694, + "AC3_7": 0.011113493420508574 + }, + "prompt_3": { + "overall_acc": 0.2483766233766234, + "language_acc": { + "Vietnamese": 0.24431818181818182, + "Indonesian": 0.29545454545454547, + "Malay": 0.22727272727272727, + "English": 0.2556818181818182, + "Spanish": 0.2727272727272727, + "Filipino": 0.23863636363636365, + "Chinese": 0.20454545454545456 + }, + "consistency_score_2": 0.2532467532467533, + "consistency_score_3": 0.06461038961038959, + "consistency_score_4": 0.017045454545454537, + "consistency_score_5": 0.003787878787878787, + "consistency_score_6": 0.0, + "consistency_score_7": 0.0, + "detailed_consistency_score": { + "2_combine": { + "Vietnamese,Indonesian": 0.23295454545454544, + "Vietnamese,Malay": 0.24431818181818182, + "Vietnamese,English": 0.25, + "Vietnamese,Spanish": 0.26136363636363635, + "Vietnamese,Filipino": 0.2727272727272727, + "Vietnamese,Chinese": 0.2897727272727273, + "Indonesian,Malay": 0.25, + "Indonesian,English": 0.24431818181818182, + "Indonesian,Spanish": 0.24431818181818182, + "Indonesian,Filipino": 0.2727272727272727, + "Indonesian,Chinese": 0.25, + "Malay,English": 0.24431818181818182, + "Malay,Spanish": 0.24431818181818182, + "Malay,Filipino": 0.23863636363636365, + "Malay,Chinese": 0.23295454545454544, + "English,Spanish": 0.3181818181818182, + "English,Filipino": 0.1875, + "English,Chinese": 0.26136363636363635, + "Spanish,Filipino": 0.3068181818181818, + "Spanish,Chinese": 0.1875, + "Filipino,Chinese": 0.2840909090909091 + }, + "3_combine": { + "Vietnamese,Indonesian,Malay": 0.05113636363636364, + "Vietnamese,Indonesian,English": 0.05113636363636364, + "Vietnamese,Indonesian,Spanish": 0.0625, + "Vietnamese,Indonesian,Filipino": 0.07386363636363637, + "Vietnamese,Indonesian,Chinese": 0.045454545454545456, + "Vietnamese,Malay,English": 0.07386363636363637, + "Vietnamese,Malay,Spanish": 0.056818181818181816, + "Vietnamese,Malay,Filipino": 0.056818181818181816, + "Vietnamese,Malay,Chinese": 0.056818181818181816, + "Vietnamese,English,Spanish": 0.07386363636363637, + "Vietnamese,English,Filipino": 0.045454545454545456, + "Vietnamese,English,Chinese": 0.0625, + "Vietnamese,Spanish,Filipino": 0.08522727272727272, + "Vietnamese,Spanish,Chinese": 0.09659090909090909, + "Vietnamese,Filipino,Chinese": 0.08522727272727272, + "Indonesian,Malay,English": 0.03409090909090909, + "Indonesian,Malay,Spanish": 0.0625, + "Indonesian,Malay,Filipino": 0.0625, + "Indonesian,Malay,Chinese": 0.06818181818181818, + "Indonesian,English,Spanish": 0.06818181818181818, + "Indonesian,English,Filipino": 0.05113636363636364, + "Indonesian,English,Chinese": 0.056818181818181816, + "Indonesian,Spanish,Filipino": 0.07954545454545454, + "Indonesian,Spanish,Chinese": 0.045454545454545456, + "Indonesian,Filipino,Chinese": 0.06818181818181818, + "Malay,English,Spanish": 0.10795454545454546, + "Malay,English,Filipino": 0.06818181818181818, + "Malay,English,Chinese": 0.06818181818181818, + "Malay,Spanish,Filipino": 0.06818181818181818, + "Malay,Spanish,Chinese": 0.03977272727272727, + "Malay,Filipino,Chinese": 0.0625, + "English,Spanish,Filipino": 0.09090909090909091, + "English,Spanish,Chinese": 0.0625, + "English,Filipino,Chinese": 0.056818181818181816, + "Spanish,Filipino,Chinese": 0.0625 + }, + "4_combine": { + "Vietnamese,Indonesian,Malay,English": 0.017045454545454544, + "Vietnamese,Indonesian,Malay,Spanish": 0.011363636363636364, + "Vietnamese,Indonesian,Malay,Filipino": 0.005681818181818182, + "Vietnamese,Indonesian,Malay,Chinese": 0.011363636363636364, + "Vietnamese,Indonesian,English,Spanish": 0.005681818181818182, + "Vietnamese,Indonesian,English,Filipino": 0.005681818181818182, + "Vietnamese,Indonesian,English,Chinese": 0.005681818181818182, + "Vietnamese,Indonesian,Spanish,Filipino": 0.017045454545454544, + "Vietnamese,Indonesian,Spanish,Chinese": 0.017045454545454544, + "Vietnamese,Indonesian,Filipino,Chinese": 0.011363636363636364, + "Vietnamese,Malay,English,Spanish": 0.03409090909090909, + "Vietnamese,Malay,English,Filipino": 0.028409090909090908, + "Vietnamese,Malay,English,Chinese": 0.022727272727272728, + "Vietnamese,Malay,Spanish,Filipino": 0.017045454545454544, + "Vietnamese,Malay,Spanish,Chinese": 0.011363636363636364, + "Vietnamese,Malay,Filipino,Chinese": 0.011363636363636364, + "Vietnamese,English,Spanish,Filipino": 0.028409090909090908, + "Vietnamese,English,Spanish,Chinese": 0.028409090909090908, + "Vietnamese,English,Filipino,Chinese": 0.017045454545454544, + "Vietnamese,Spanish,Filipino,Chinese": 0.028409090909090908, + "Indonesian,Malay,English,Spanish": 0.022727272727272728, + "Indonesian,Malay,English,Filipino": 0.011363636363636364, + "Indonesian,Malay,English,Chinese": 0.011363636363636364, + "Indonesian,Malay,Spanish,Filipino": 0.022727272727272728, + "Indonesian,Malay,Spanish,Chinese": 0.017045454545454544, + "Indonesian,Malay,Filipino,Chinese": 0.011363636363636364, + "Indonesian,English,Spanish,Filipino": 0.017045454545454544, + "Indonesian,English,Spanish,Chinese": 0.005681818181818182, + "Indonesian,English,Filipino,Chinese": 0.011363636363636364, + "Indonesian,Spanish,Filipino,Chinese": 0.017045454545454544, + "Malay,English,Spanish,Filipino": 0.03409090909090909, + "Malay,English,Spanish,Chinese": 0.022727272727272728, + "Malay,English,Filipino,Chinese": 0.017045454545454544, + "Malay,Spanish,Filipino,Chinese": 0.017045454545454544, + "English,Spanish,Filipino,Chinese": 0.022727272727272728 + }, + "5_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish": 0.005681818181818182, + "Vietnamese,Indonesian,Malay,English,Filipino": 0.005681818181818182, + "Vietnamese,Indonesian,Malay,English,Chinese": 0.005681818181818182, + "Vietnamese,Indonesian,Malay,Spanish,Filipino": 0.0, + "Vietnamese,Indonesian,Malay,Spanish,Chinese": 0.005681818181818182, + "Vietnamese,Indonesian,Malay,Filipino,Chinese": 0.0, + "Vietnamese,Indonesian,English,Spanish,Filipino": 0.0, + "Vietnamese,Indonesian,English,Spanish,Chinese": 0.0, + "Vietnamese,Indonesian,English,Filipino,Chinese": 0.0, + "Vietnamese,Indonesian,Spanish,Filipino,Chinese": 0.0, + "Vietnamese,Malay,English,Spanish,Filipino": 0.011363636363636364, + "Vietnamese,Malay,English,Spanish,Chinese": 0.005681818181818182, + "Vietnamese,Malay,English,Filipino,Chinese": 0.005681818181818182, + "Vietnamese,Malay,Spanish,Filipino,Chinese": 0.0, + "Vietnamese,English,Spanish,Filipino,Chinese": 0.011363636363636364, + "Indonesian,Malay,English,Spanish,Filipino": 0.005681818181818182, + "Indonesian,Malay,English,Spanish,Chinese": 0.005681818181818182, + "Indonesian,Malay,English,Filipino,Chinese": 0.0, + "Indonesian,Malay,Spanish,Filipino,Chinese": 0.005681818181818182, + "Indonesian,English,Spanish,Filipino,Chinese": 0.0, + "Malay,English,Spanish,Filipino,Chinese": 0.005681818181818182 + }, + "6_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish,Filipino": 0.0, + "Vietnamese,Indonesian,Malay,English,Spanish,Chinese": 0.0, + "Vietnamese,Indonesian,Malay,English,Filipino,Chinese": 0.0, + "Vietnamese,Indonesian,Malay,Spanish,Filipino,Chinese": 0.0, + "Vietnamese,Indonesian,English,Spanish,Filipino,Chinese": 0.0, + "Vietnamese,Malay,English,Spanish,Filipino,Chinese": 0.0, + "Indonesian,Malay,English,Spanish,Filipino,Chinese": 0.0 + }, + "7_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish,Filipino,Chinese": 0.0 + } + }, + "AC3_2": 0.2507880468545565, + "AC3_3": 0.10254553534472205, + "AC3_4": 0.03190158464185903, + "AC3_5": 0.007461958639256987, + "AC3_6": 0.0, + "AC3_7": 0.0 + }, + "prompt_4": { + "overall_acc": 0.2564935064935065, + "language_acc": { + "Vietnamese": 0.23863636363636365, + "Indonesian": 0.29545454545454547, + "Malay": 0.25, + "English": 0.25, + "Spanish": 0.2556818181818182, + "Filipino": 0.2784090909090909, + "Chinese": 0.22727272727272727 + }, + "consistency_score_2": 0.3027597402597402, + "consistency_score_3": 0.11185064935064933, + "consistency_score_4": 0.05016233766233765, + "consistency_score_5": 0.02732683982683982, + "consistency_score_6": 0.017045454545454544, + "consistency_score_7": 0.011363636363636364, + "detailed_consistency_score": { + "2_combine": { + "Vietnamese,Indonesian": 0.3068181818181818, + "Vietnamese,Malay": 0.3125, + "Vietnamese,English": 0.3465909090909091, + "Vietnamese,Spanish": 0.30113636363636365, + "Vietnamese,Filipino": 0.32954545454545453, + "Vietnamese,Chinese": 0.3181818181818182, + "Indonesian,Malay": 0.4034090909090909, + "Indonesian,English": 0.26136363636363635, + "Indonesian,Spanish": 0.2897727272727273, + "Indonesian,Filipino": 0.24431818181818182, + "Indonesian,Chinese": 0.2727272727272727, + "Malay,English": 0.3409090909090909, + "Malay,Spanish": 0.2784090909090909, + "Malay,Filipino": 0.30113636363636365, + "Malay,Chinese": 0.30113636363636365, + "English,Spanish": 0.23863636363636365, + "English,Filipino": 0.2897727272727273, + "English,Chinese": 0.375, + "Spanish,Filipino": 0.3181818181818182, + "Spanish,Chinese": 0.22727272727272727, + "Filipino,Chinese": 0.30113636363636365 + }, + "3_combine": { + "Vietnamese,Indonesian,Malay": 0.1534090909090909, + "Vietnamese,Indonesian,English": 0.11931818181818182, + "Vietnamese,Indonesian,Spanish": 0.10227272727272728, + "Vietnamese,Indonesian,Filipino": 0.10227272727272728, + "Vietnamese,Indonesian,Chinese": 0.13636363636363635, + "Vietnamese,Malay,English": 0.1590909090909091, + "Vietnamese,Malay,Spanish": 0.10227272727272728, + "Vietnamese,Malay,Filipino": 0.125, + "Vietnamese,Malay,Chinese": 0.11931818181818182, + "Vietnamese,English,Spanish": 0.11363636363636363, + "Vietnamese,English,Filipino": 0.11363636363636363, + "Vietnamese,English,Chinese": 0.1534090909090909, + "Vietnamese,Spanish,Filipino": 0.11931818181818182, + "Vietnamese,Spanish,Chinese": 0.10795454545454546, + "Vietnamese,Filipino,Chinese": 0.11931818181818182, + "Indonesian,Malay,English": 0.13636363636363635, + "Indonesian,Malay,Spanish": 0.13068181818181818, + "Indonesian,Malay,Filipino": 0.11931818181818182, + "Indonesian,Malay,Chinese": 0.13068181818181818, + "Indonesian,English,Spanish": 0.07386363636363637, + "Indonesian,English,Filipino": 0.06818181818181818, + "Indonesian,English,Chinese": 0.11363636363636363, + "Indonesian,Spanish,Filipino": 0.09090909090909091, + "Indonesian,Spanish,Chinese": 0.07386363636363637, + "Indonesian,Filipino,Chinese": 0.09090909090909091, + "Malay,English,Spanish": 0.10227272727272728, + "Malay,English,Filipino": 0.11931818181818182, + "Malay,English,Chinese": 0.13068181818181818, + "Malay,Spanish,Filipino": 0.10227272727272728, + "Malay,Spanish,Chinese": 0.07954545454545454, + "Malay,Filipino,Chinese": 0.10795454545454546, + "English,Spanish,Filipino": 0.09659090909090909, + "English,Spanish,Chinese": 0.10795454545454546, + "English,Filipino,Chinese": 0.10227272727272728, + "Spanish,Filipino,Chinese": 0.09090909090909091 + }, + "4_combine": { + "Vietnamese,Indonesian,Malay,English": 0.07386363636363637, + "Vietnamese,Indonesian,Malay,Spanish": 0.056818181818181816, + "Vietnamese,Indonesian,Malay,Filipino": 0.056818181818181816, + "Vietnamese,Indonesian,Malay,Chinese": 0.0625, + "Vietnamese,Indonesian,English,Spanish": 0.045454545454545456, + "Vietnamese,Indonesian,English,Filipino": 0.03409090909090909, + "Vietnamese,Indonesian,English,Chinese": 0.06818181818181818, + "Vietnamese,Indonesian,Spanish,Filipino": 0.05113636363636364, + "Vietnamese,Indonesian,Spanish,Chinese": 0.045454545454545456, + "Vietnamese,Indonesian,Filipino,Chinese": 0.0625, + "Vietnamese,Malay,English,Spanish": 0.0625, + "Vietnamese,Malay,English,Filipino": 0.056818181818181816, + "Vietnamese,Malay,English,Chinese": 0.06818181818181818, + "Vietnamese,Malay,Spanish,Filipino": 0.05113636363636364, + "Vietnamese,Malay,Spanish,Chinese": 0.056818181818181816, + "Vietnamese,Malay,Filipino,Chinese": 0.056818181818181816, + "Vietnamese,English,Spanish,Filipino": 0.05113636363636364, + "Vietnamese,English,Spanish,Chinese": 0.056818181818181816, + "Vietnamese,English,Filipino,Chinese": 0.03977272727272727, + "Vietnamese,Spanish,Filipino,Chinese": 0.045454545454545456, + "Indonesian,Malay,English,Spanish": 0.05113636363636364, + "Indonesian,Malay,English,Filipino": 0.045454545454545456, + "Indonesian,Malay,English,Chinese": 0.056818181818181816, + "Indonesian,Malay,Spanish,Filipino": 0.045454545454545456, + "Indonesian,Malay,Spanish,Chinese": 0.045454545454545456, + "Indonesian,Malay,Filipino,Chinese": 0.05113636363636364, + "Indonesian,English,Spanish,Filipino": 0.028409090909090908, + "Indonesian,English,Spanish,Chinese": 0.03409090909090909, + "Indonesian,English,Filipino,Chinese": 0.028409090909090908, + "Indonesian,Spanish,Filipino,Chinese": 0.03977272727272727, + "Malay,English,Spanish,Filipino": 0.05113636363636364, + "Malay,English,Spanish,Chinese": 0.05113636363636364, + "Malay,English,Filipino,Chinese": 0.05113636363636364, + "Malay,Spanish,Filipino,Chinese": 0.03409090909090909, + "English,Spanish,Filipino,Chinese": 0.03977272727272727 + }, + "5_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish": 0.03409090909090909, + "Vietnamese,Indonesian,Malay,English,Filipino": 0.028409090909090908, + "Vietnamese,Indonesian,Malay,English,Chinese": 0.03409090909090909, + "Vietnamese,Indonesian,Malay,Spanish,Filipino": 0.028409090909090908, + "Vietnamese,Indonesian,Malay,Spanish,Chinese": 0.028409090909090908, + "Vietnamese,Indonesian,Malay,Filipino,Chinese": 0.03409090909090909, + "Vietnamese,Indonesian,English,Spanish,Filipino": 0.022727272727272728, + "Vietnamese,Indonesian,English,Spanish,Chinese": 0.022727272727272728, + "Vietnamese,Indonesian,English,Filipino,Chinese": 0.022727272727272728, + "Vietnamese,Indonesian,Spanish,Filipino,Chinese": 0.028409090909090908, + "Vietnamese,Malay,English,Spanish,Filipino": 0.03977272727272727, + "Vietnamese,Malay,English,Spanish,Chinese": 0.03409090909090909, + "Vietnamese,Malay,English,Filipino,Chinese": 0.028409090909090908, + "Vietnamese,Malay,Spanish,Filipino,Chinese": 0.028409090909090908, + "Vietnamese,English,Spanish,Filipino,Chinese": 0.017045454545454544, + "Indonesian,Malay,English,Spanish,Filipino": 0.028409090909090908, + "Indonesian,Malay,English,Spanish,Chinese": 0.028409090909090908, + "Indonesian,Malay,English,Filipino,Chinese": 0.022727272727272728, + "Indonesian,Malay,Spanish,Filipino,Chinese": 0.022727272727272728, + "Indonesian,English,Spanish,Filipino,Chinese": 0.017045454545454544, + "Malay,English,Spanish,Filipino,Chinese": 0.022727272727272728 + }, + "6_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish,Filipino": 0.022727272727272728, + "Vietnamese,Indonesian,Malay,English,Spanish,Chinese": 0.017045454545454544, + "Vietnamese,Indonesian,Malay,English,Filipino,Chinese": 0.017045454545454544, + "Vietnamese,Indonesian,Malay,Spanish,Filipino,Chinese": 0.017045454545454544, + "Vietnamese,Indonesian,English,Spanish,Filipino,Chinese": 0.011363636363636364, + "Vietnamese,Malay,English,Spanish,Filipino,Chinese": 0.017045454545454544, + "Indonesian,Malay,English,Spanish,Filipino,Chinese": 0.017045454545454544 + }, + "7_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish,Filipino,Chinese": 0.011363636363636364 + } + }, + "AC3_2": 0.2777128531348934, + "AC3_3": 0.15577260989515027, + "AC3_4": 0.08391370405574175, + "AC3_5": 0.04939150456952641, + "AC3_6": 0.03196654975901778, + "AC3_7": 0.02176308539132415 + }, + "prompt_5": { + "overall_acc": 0.25405844155844154, + "language_acc": { + "Vietnamese": 0.2784090909090909, + "Indonesian": 0.2159090909090909, + "Malay": 0.2215909090909091, + "English": 0.2727272727272727, + "Spanish": 0.24431818181818182, + "Filipino": 0.3068181818181818, + "Chinese": 0.23863636363636365 + }, + "consistency_score_2": 0.2827380952380952, + "consistency_score_3": 0.08912337662337663, + "consistency_score_4": 0.03165584415584415, + "consistency_score_5": 0.013257575757575756, + "consistency_score_6": 0.007305194805194807, + "consistency_score_7": 0.005681818181818182, + "detailed_consistency_score": { + "2_combine": { + "Vietnamese,Indonesian": 0.2840909090909091, + "Vietnamese,Malay": 0.22727272727272727, + "Vietnamese,English": 0.2215909090909091, + "Vietnamese,Spanish": 0.3068181818181818, + "Vietnamese,Filipino": 0.23295454545454544, + "Vietnamese,Chinese": 0.29545454545454547, + "Indonesian,Malay": 0.2897727272727273, + "Indonesian,English": 0.26704545454545453, + "Indonesian,Spanish": 0.2840909090909091, + "Indonesian,Filipino": 0.25, + "Indonesian,Chinese": 0.3068181818181818, + "Malay,English": 0.2727272727272727, + "Malay,Spanish": 0.25, + "Malay,Filipino": 0.2840909090909091, + "Malay,Chinese": 0.3465909090909091, + "English,Spanish": 0.2727272727272727, + "English,Filipino": 0.2784090909090909, + "English,Chinese": 0.3409090909090909, + "Spanish,Filipino": 0.3522727272727273, + "Spanish,Chinese": 0.3352272727272727, + "Filipino,Chinese": 0.23863636363636365 + }, + "3_combine": { + "Vietnamese,Indonesian,Malay": 0.0625, + "Vietnamese,Indonesian,English": 0.06818181818181818, + "Vietnamese,Indonesian,Spanish": 0.07386363636363637, + "Vietnamese,Indonesian,Filipino": 0.07954545454545454, + "Vietnamese,Indonesian,Chinese": 0.08522727272727272, + "Vietnamese,Malay,English": 0.028409090909090908, + "Vietnamese,Malay,Spanish": 0.0625, + "Vietnamese,Malay,Filipino": 0.07954545454545454, + "Vietnamese,Malay,Chinese": 0.07386363636363637, + "Vietnamese,English,Spanish": 0.06818181818181818, + "Vietnamese,English,Filipino": 0.056818181818181816, + "Vietnamese,English,Chinese": 0.07386363636363637, + "Vietnamese,Spanish,Filipino": 0.10227272727272728, + "Vietnamese,Spanish,Chinese": 0.11931818181818182, + "Vietnamese,Filipino,Chinese": 0.06818181818181818, + "Indonesian,Malay,English": 0.09659090909090909, + "Indonesian,Malay,Spanish": 0.07386363636363637, + "Indonesian,Malay,Filipino": 0.08522727272727272, + "Indonesian,Malay,Chinese": 0.09659090909090909, + "Indonesian,English,Spanish": 0.09659090909090909, + "Indonesian,English,Filipino": 0.07954545454545454, + "Indonesian,English,Chinese": 0.09659090909090909, + "Indonesian,Spanish,Filipino": 0.11363636363636363, + "Indonesian,Spanish,Chinese": 0.11931818181818182, + "Indonesian,Filipino,Chinese": 0.06818181818181818, + "Malay,English,Spanish": 0.07954545454545454, + "Malay,English,Filipino": 0.07954545454545454, + "Malay,English,Chinese": 0.14204545454545456, + "Malay,Spanish,Filipino": 0.10795454545454546, + "Malay,Spanish,Chinese": 0.11931818181818182, + "Malay,Filipino,Chinese": 0.11931818181818182, + "English,Spanish,Filipino": 0.11931818181818182, + "English,Spanish,Chinese": 0.11363636363636363, + "English,Filipino,Chinese": 0.08522727272727272, + "Spanish,Filipino,Chinese": 0.125 + }, + "4_combine": { + "Vietnamese,Indonesian,Malay,English": 0.017045454545454544, + "Vietnamese,Indonesian,Malay,Spanish": 0.017045454545454544, + "Vietnamese,Indonesian,Malay,Filipino": 0.028409090909090908, + "Vietnamese,Indonesian,Malay,Chinese": 0.017045454545454544, + "Vietnamese,Indonesian,English,Spanish": 0.028409090909090908, + "Vietnamese,Indonesian,English,Filipino": 0.022727272727272728, + "Vietnamese,Indonesian,English,Chinese": 0.028409090909090908, + "Vietnamese,Indonesian,Spanish,Filipino": 0.03409090909090909, + "Vietnamese,Indonesian,Spanish,Chinese": 0.017045454545454544, + "Vietnamese,Indonesian,Filipino,Chinese": 0.022727272727272728, + "Vietnamese,Malay,English,Spanish": 0.011363636363636364, + "Vietnamese,Malay,English,Filipino": 0.011363636363636364, + "Vietnamese,Malay,English,Chinese": 0.017045454545454544, + "Vietnamese,Malay,Spanish,Filipino": 0.022727272727272728, + "Vietnamese,Malay,Spanish,Chinese": 0.03409090909090909, + "Vietnamese,Malay,Filipino,Chinese": 0.017045454545454544, + "Vietnamese,English,Spanish,Filipino": 0.03409090909090909, + "Vietnamese,English,Spanish,Chinese": 0.028409090909090908, + "Vietnamese,English,Filipino,Chinese": 0.011363636363636364, + "Vietnamese,Spanish,Filipino,Chinese": 0.03409090909090909, + "Indonesian,Malay,English,Spanish": 0.028409090909090908, + "Indonesian,Malay,English,Filipino": 0.022727272727272728, + "Indonesian,Malay,English,Chinese": 0.03977272727272727, + "Indonesian,Malay,Spanish,Filipino": 0.03977272727272727, + "Indonesian,Malay,Spanish,Chinese": 0.045454545454545456, + "Indonesian,Malay,Filipino,Chinese": 0.028409090909090908, + "Indonesian,English,Spanish,Filipino": 0.045454545454545456, + "Indonesian,English,Spanish,Chinese": 0.05113636363636364, + "Indonesian,English,Filipino,Chinese": 0.028409090909090908, + "Indonesian,Spanish,Filipino,Chinese": 0.03977272727272727, + "Malay,English,Spanish,Filipino": 0.045454545454545456, + "Malay,English,Spanish,Chinese": 0.056818181818181816, + "Malay,English,Filipino,Chinese": 0.056818181818181816, + "Malay,Spanish,Filipino,Chinese": 0.06818181818181818, + "English,Spanish,Filipino,Chinese": 0.056818181818181816 + }, + "5_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish": 0.011363636363636364, + "Vietnamese,Indonesian,Malay,English,Filipino": 0.011363636363636364, + "Vietnamese,Indonesian,Malay,English,Chinese": 0.011363636363636364, + "Vietnamese,Indonesian,Malay,Spanish,Filipino": 0.011363636363636364, + "Vietnamese,Indonesian,Malay,Spanish,Chinese": 0.011363636363636364, + "Vietnamese,Indonesian,Malay,Filipino,Chinese": 0.005681818181818182, + "Vietnamese,Indonesian,English,Spanish,Filipino": 0.017045454545454544, + "Vietnamese,Indonesian,English,Spanish,Chinese": 0.011363636363636364, + "Vietnamese,Indonesian,English,Filipino,Chinese": 0.005681818181818182, + "Vietnamese,Indonesian,Spanish,Filipino,Chinese": 0.005681818181818182, + "Vietnamese,Malay,English,Spanish,Filipino": 0.005681818181818182, + "Vietnamese,Malay,English,Spanish,Chinese": 0.011363636363636364, + "Vietnamese,Malay,English,Filipino,Chinese": 0.005681818181818182, + "Vietnamese,Malay,Spanish,Filipino,Chinese": 0.011363636363636364, + "Vietnamese,English,Spanish,Filipino,Chinese": 0.011363636363636364, + "Indonesian,Malay,English,Spanish,Filipino": 0.011363636363636364, + "Indonesian,Malay,English,Spanish,Chinese": 0.022727272727272728, + "Indonesian,Malay,English,Filipino,Chinese": 0.011363636363636364, + "Indonesian,Malay,Spanish,Filipino,Chinese": 0.022727272727272728, + "Indonesian,English,Spanish,Filipino,Chinese": 0.022727272727272728, + "Malay,English,Spanish,Filipino,Chinese": 0.03977272727272727 + }, + "6_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish,Filipino": 0.005681818181818182, + "Vietnamese,Indonesian,Malay,English,Spanish,Chinese": 0.011363636363636364, + "Vietnamese,Indonesian,Malay,English,Filipino,Chinese": 0.005681818181818182, + "Vietnamese,Indonesian,Malay,Spanish,Filipino,Chinese": 0.005681818181818182, + "Vietnamese,Indonesian,English,Spanish,Filipino,Chinese": 0.005681818181818182, + "Vietnamese,Malay,English,Spanish,Filipino,Chinese": 0.005681818181818182, + "Indonesian,Malay,English,Spanish,Filipino,Chinese": 0.011363636363636364 + }, + "7_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish,Filipino,Chinese": 0.005681818181818182 + } + }, + "AC3_2": 0.26763212840636386, + "AC3_3": 0.13195656043040851, + "AC3_4": 0.05629704100745069, + "AC3_5": 0.025200128809122774, + "AC3_6": 0.014202024677957328, + "AC3_7": 0.011115056813902522 + } + }, + "sg_eval": { + "prompt_1": { + "accuracy": 0.2524271844660194 + }, + "prompt_2": { + "accuracy": 0.18446601941747573 + }, + "prompt_3": { + "accuracy": 0.24271844660194175 + }, + "prompt_4": { + "accuracy": 0.22330097087378642 + }, + "prompt_5": { + "accuracy": 0.33980582524271846 + } + }, + "cn_eval": { + "prompt_1": { + "accuracy": 0.21904761904761905 + }, + "prompt_2": { + "accuracy": 0.18095238095238095 + }, + "prompt_3": { + "accuracy": 0.2 + }, + "prompt_4": { + "accuracy": 0.22857142857142856 + }, + "prompt_5": { + "accuracy": 0.24761904761904763 + } + }, + "us_eval": { + "prompt_1": { + "accuracy": 0.22429906542056074 + }, + "prompt_2": { + "accuracy": 0.27102803738317754 + }, + "prompt_3": { + "accuracy": 0.14953271028037382 + }, + "prompt_4": { + "accuracy": 0.14953271028037382 + }, + "prompt_5": { + "accuracy": 0.2336448598130841 + } + }, + "ph_eval": { + "prompt_1": { + "accuracy": 0.34, + "category_acc": { + "brand": 0.3, + "demographics": 0.4, + "biology": 0.2, + "history": 0.26666666666666666, + "literature": 0.5, + "politics": 0.8, + "culture": 0.3, + "film": 0.2, + "law": 0.3, + "geography": 0.2 + } + }, + "prompt_2": { + "accuracy": 0.33, + "category_acc": { + "brand": 0.3, + "demographics": 0.2, + "biology": 0.3, + "history": 0.13333333333333333, + "literature": 0.3, + "politics": 0.6, + "culture": 0.3, + "film": 0.5, + "law": 0.3, + "geography": 0.4 + } + }, + "prompt_3": { + "accuracy": 0.33, + "category_acc": { + "brand": 0.4, + "demographics": 0.6, + "biology": 0.2, + "history": 0.26666666666666666, + "literature": 0.4, + "politics": 0.7, + "culture": 0.1, + "film": 0.2, + "law": 0.3, + "geography": 0.3 + } + }, + "prompt_4": { + "accuracy": 0.34, + "category_acc": { + "brand": 0.3, + "demographics": 0.6, + "biology": 0.2, + "history": 0.2, + "literature": 0.5, + "politics": 0.8, + "culture": 0.1, + "film": 0.3, + "law": 0.3, + "geography": 0.3 + } + }, + "prompt_5": { + "accuracy": 0.33, + "category_acc": { + "brand": 0.0, + "demographics": 0.8, + "biology": 0.4, + "history": 0.3333333333333333, + "literature": 0.3, + "politics": 0.3, + "culture": 0.3, + "film": 0.3, + "law": 0.3, + "geography": 0.5 + } + } + }, + "sing2eng": { + "prompt_1": { + "bleu_score": 0.045582081500857526 + }, + "prompt_2": { + "bleu_score": 0.03992758448540713 + }, + "prompt_3": { + "bleu_score": 0.04189605777722635 + }, + "prompt_4": { + "bleu_score": 0.03961742495964162 + }, + "prompt_5": { + "bleu_score": 0.0331185986101603 + } + }, + "flores_ind2eng": { + "prompt_1": { + "bleu_score": 0.012295758955687148 + }, + "prompt_2": { + "bleu_score": 0.027254623395354057 + }, + "prompt_3": { + "bleu_score": 0.023408287560585238 + }, + "prompt_4": { + "bleu_score": 0.014959513603702445 + }, + "prompt_5": { + "bleu_score": 0.008048362232418494 + } + }, + "flores_vie2eng": { + "prompt_1": { + "bleu_score": 0.02536214269191094 + }, + "prompt_2": { + "bleu_score": 0.01489915453686534 + }, + "prompt_3": { + "bleu_score": 0.026511302577627252 + }, + "prompt_4": { + "bleu_score": 0.0314255368257811 + }, + "prompt_5": { + "bleu_score": 0.009915369171731282 + } + }, + "flores_zho2eng": { + "prompt_1": { + "bleu_score": 0.009232091353875814 + }, + "prompt_2": { + "bleu_score": 0.011782292357984833 + }, + "prompt_3": { + "bleu_score": 0.00861814381182431 + }, + "prompt_4": { + "bleu_score": 0.016777357644487248 + }, + "prompt_5": { + "bleu_score": 0.016872138788912104 + } + }, + "flores_zsm2eng": { + "prompt_1": { + "bleu_score": 0.01426890101006039 + }, + "prompt_2": { + "bleu_score": 0.02897043239551227 + }, + "prompt_3": { + "bleu_score": 0.02771466694077164 + }, + "prompt_4": { + "bleu_score": 0.020710729870485292 + }, + "prompt_5": { + "bleu_score": 0.0066293122653927755 + } + }, + "mmlu": { + "prompt_1": { + "accuracy": 0.23337222870478413 + }, + "prompt_2": { + "accuracy": 0.23453908984830804 + }, + "prompt_3": { + "accuracy": 0.2660443407234539 + }, + "prompt_4": { + "accuracy": 0.2637106184364061 + }, + "prompt_5": -1 + }, + "mmlu_full": { + "prompt_1": { + "accuracy": 0.24233106900250267, + "category_acc": { + "high_school_european_history": 0.23170731707317074, + "business_ethics": 0.24242424242424243, + "clinical_knowledge": 0.26136363636363635, + "medical_genetics": 0.2727272727272727, + "high_school_us_history": 0.21674876847290642, + "high_school_physics": 0.25333333333333335, + "high_school_world_history": 0.22033898305084745, + "virology": 0.24848484848484848, + "high_school_microeconomics": 0.24050632911392406, + "econometrics": 0.19469026548672566, + "college_computer_science": 0.21212121212121213, + "high_school_biology": 0.28802588996763756, + "abstract_algebra": 0.2727272727272727, + "professional_accounting": 0.23843416370106763, + "philosophy": 0.23870967741935484, + "professional_medicine": 0.24354243542435425, + "nutrition": 0.24262295081967214, + "global_facts": 0.2222222222222222, + "machine_learning": 0.18018018018018017, + "security_studies": 0.26229508196721313, + "public_relations": 0.22018348623853212, + "professional_psychology": 0.24549918166939444, + "prehistory": 0.22910216718266255, + "anatomy": 0.208955223880597, + "human_sexuality": 0.27692307692307694, + "college_medicine": 0.23837209302325582, + "high_school_government_and_politics": 0.23958333333333334, + "college_chemistry": 0.25252525252525254, + "logical_fallacies": 0.2345679012345679, + "high_school_geography": 0.19796954314720813, + "elementary_mathematics": 0.23607427055702918, + "human_aging": 0.21171171171171171, + "college_mathematics": 0.23232323232323232, + "high_school_psychology": 0.21875, + "formal_logic": 0.272, + "high_school_statistics": 0.24651162790697675, + "international_law": 0.35833333333333334, + "high_school_mathematics": 0.2379182156133829, + "high_school_computer_science": 0.31313131313131315, + "conceptual_physics": 0.24358974358974358, + "miscellaneous": 0.24680306905370844, + "high_school_chemistry": 0.23267326732673269, + "marketing": 0.2532188841201717, + "professional_law": 0.24722765818656228, + "management": 0.19607843137254902, + "college_physics": 0.24752475247524752, + "jurisprudence": 0.2897196261682243, + "world_religions": 0.27058823529411763, + "sociology": 0.24, + "us_foreign_policy": 0.2727272727272727, + "high_school_macroeconomics": 0.2416452442159383, + "computer_security": 0.2828282828282828, + "moral_scenarios": 0.2225950782997763, + "moral_disputes": 0.25217391304347825, + "electrical_engineering": 0.25, + "astronomy": 0.2781456953642384, + "college_biology": 0.21678321678321677 + } + }, + "prompt_2": { + "accuracy": 0.2426170897390061, + "category_acc": { + "high_school_european_history": 0.31097560975609756, + "business_ethics": 0.26262626262626265, + "clinical_knowledge": 0.24621212121212122, + "medical_genetics": 0.3434343434343434, + "high_school_us_history": 0.270935960591133, + "high_school_physics": 0.20666666666666667, + "high_school_world_history": 0.2542372881355932, + "virology": 0.296969696969697, + "high_school_microeconomics": 0.22362869198312235, + "econometrics": 0.24778761061946902, + "college_computer_science": 0.2222222222222222, + "high_school_biology": 0.22330097087378642, + "abstract_algebra": 0.23232323232323232, + "professional_accounting": 0.21352313167259787, + "philosophy": 0.22903225806451613, + "professional_medicine": 0.25830258302583026, + "nutrition": 0.3114754098360656, + "global_facts": 0.24242424242424243, + "machine_learning": 0.26126126126126126, + "security_studies": 0.1680327868852459, + "public_relations": 0.21100917431192662, + "professional_psychology": 0.2635024549918167, + "prehistory": 0.23219814241486067, + "anatomy": 0.17164179104477612, + "human_sexuality": 0.2230769230769231, + "college_medicine": 0.2441860465116279, + "high_school_government_and_politics": 0.24479166666666666, + "college_chemistry": 0.21212121212121213, + "logical_fallacies": 0.2962962962962963, + "high_school_geography": 0.20812182741116753, + "elementary_mathematics": 0.22811671087533156, + "human_aging": 0.23423423423423423, + "college_mathematics": 0.20202020202020202, + "high_school_psychology": 0.20772058823529413, + "formal_logic": 0.312, + "high_school_statistics": 0.17209302325581396, + "international_law": 0.36666666666666664, + "high_school_mathematics": 0.2825278810408922, + "high_school_computer_science": 0.24242424242424243, + "conceptual_physics": 0.24786324786324787, + "miscellaneous": 0.23657289002557544, + "high_school_chemistry": 0.19801980198019803, + "marketing": 0.2532188841201717, + "professional_law": 0.25440313111545987, + "management": 0.23529411764705882, + "college_physics": 0.15841584158415842, + "jurisprudence": 0.29906542056074764, + "world_religions": 0.3058823529411765, + "sociology": 0.29, + "us_foreign_policy": 0.2222222222222222, + "high_school_macroeconomics": 0.1928020565552699, + "computer_security": 0.26262626262626265, + "moral_scenarios": 0.22818791946308725, + "moral_disputes": 0.2608695652173913, + "electrical_engineering": 0.25, + "astronomy": 0.2119205298013245, + "college_biology": 0.25874125874125875 + } + }, + "prompt_3": -1, + "prompt_4": -1, + "prompt_5": -1 + }, + "c_eval": { + "prompt_1": { + "accuracy": 0.2540861812778603 + }, + "prompt_2": { + "accuracy": 0.25037147102526003 + }, + "prompt_3": { + "accuracy": 0.2563150074294205 + }, + "prompt_4": { + "accuracy": 0.25928677563150077 + }, + "prompt_5": -1 + }, + "c_eval_full": { + "prompt_1": { + "accuracy": 0.24968866749688667, + "category_acc": { + "computer_network": 0.20833333333333334, + "operating_system": 0.2916666666666667, + "computer_architecture": 0.2692307692307692, + "college_programming": 0.19047619047619047, + "college_physics": 0.25, + "college_chemistry": 0.2413793103448276, + "advanced_mathematics": 0.20833333333333334, + "probability_and_statistics": 0.30434782608695654, + "discrete_mathematics": 0.09523809523809523, + "electrical_engineer": 0.38095238095238093, + "metrology_engineer": 0.10344827586206896, + "high_school_mathematics": 0.2608695652173913, + "high_school_physics": 0.16666666666666666, + "high_school_chemistry": 0.25, + "high_school_biology": 0.20833333333333334, + "middle_school_mathematics": 0.25, + "middle_school_biology": 0.23076923076923078, + "middle_school_physics": 0.25, + "middle_school_chemistry": 0.2, + "veterinary_medicine": 0.25, + "college_economics": 0.3, + "business_administration": 0.23684210526315788, + "marxism": 0.3333333333333333, + "mao_zedong_thought": 0.41379310344827586, + "education_science": 0.20588235294117646, + "teacher_qualification": 0.16326530612244897, + "high_school_politics": 0.20833333333333334, + "high_school_geography": 0.2916666666666667, + "middle_school_politics": 0.3076923076923077, + "middle_school_geography": 0.23529411764705882, + "modern_chinese_history": 0.17857142857142858, + "ideological_and_moral_cultivation": 0.25, + "logic": 0.1111111111111111, + "law": 0.13793103448275862, + "chinese_language_and_literature": 0.2857142857142857, + "art_studies": 0.34210526315789475, + "professional_tour_guide": 0.38235294117647056, + "legal_professional": 0.17857142857142858, + "high_school_chinese": 0.125, + "high_school_history": 0.44, + "middle_school_history": 0.25925925925925924, + "civil_servant": 0.21153846153846154, + "sports_science": 0.2916666666666667, + "plant_protection": 0.25925925925925924, + "basic_medicine": 0.2916666666666667, + "clinical_medicine": 0.2222222222222222, + "urban_and_rural_planner": 0.3137254901960784, + "accountant": 0.3148148148148148, + "fire_engineer": 0.16666666666666666, + "environmental_impact_assessment_engineer": 0.2777777777777778, + "tax_accountant": 0.24074074074074073, + "physician": 0.24074074074074073 + } + }, + "prompt_2": { + "accuracy": 0.2590286425902864, + "category_acc": { + "computer_network": 0.125, + "operating_system": 0.25, + "computer_architecture": 0.34615384615384615, + "college_programming": 0.2619047619047619, + "college_physics": 0.20833333333333334, + "college_chemistry": 0.3448275862068966, + "advanced_mathematics": 0.20833333333333334, + "probability_and_statistics": 0.2608695652173913, + "discrete_mathematics": 0.09523809523809523, + "electrical_engineer": 0.38095238095238093, + "metrology_engineer": 0.27586206896551724, + "high_school_mathematics": 0.34782608695652173, + "high_school_physics": 0.16666666666666666, + "high_school_chemistry": 0.3333333333333333, + "high_school_biology": 0.2916666666666667, + "middle_school_mathematics": 0.2916666666666667, + "middle_school_biology": 0.2692307692307692, + "middle_school_physics": 0.25, + "middle_school_chemistry": 0.16, + "veterinary_medicine": 0.14285714285714285, + "college_economics": 0.2833333333333333, + "business_administration": 0.15789473684210525, + "marxism": 0.2916666666666667, + "mao_zedong_thought": 0.3793103448275862, + "education_science": 0.23529411764705882, + "teacher_qualification": 0.22448979591836735, + "high_school_politics": 0.20833333333333334, + "high_school_geography": 0.375, + "middle_school_politics": 0.2692307692307692, + "middle_school_geography": 0.29411764705882354, + "modern_chinese_history": 0.17857142857142858, + "ideological_and_moral_cultivation": 0.20833333333333334, + "logic": 0.1111111111111111, + "law": 0.2413793103448276, + "chinese_language_and_literature": 0.32142857142857145, + "art_studies": 0.39473684210526316, + "professional_tour_guide": 0.3235294117647059, + "legal_professional": 0.25, + "high_school_chinese": 0.25, + "high_school_history": 0.4, + "middle_school_history": 0.18518518518518517, + "civil_servant": 0.11538461538461539, + "sports_science": 0.2916666666666667, + "plant_protection": 0.2962962962962963, + "basic_medicine": 0.2916666666666667, + "clinical_medicine": 0.18518518518518517, + "urban_and_rural_planner": 0.35294117647058826, + "accountant": 0.2777777777777778, + "fire_engineer": 0.25, + "environmental_impact_assessment_engineer": 0.2777777777777778, + "tax_accountant": 0.2222222222222222, + "physician": 0.25925925925925924 + } + }, + "prompt_3": { + "accuracy": 0.22291407222914073, + "category_acc": { + "computer_network": 0.08333333333333333, + "operating_system": 0.20833333333333334, + "computer_architecture": 0.38461538461538464, + "college_programming": 0.2619047619047619, + "college_physics": 0.16666666666666666, + "college_chemistry": 0.2413793103448276, + "advanced_mathematics": 0.125, + "probability_and_statistics": 0.21739130434782608, + "discrete_mathematics": 0.2857142857142857, + "electrical_engineer": 0.38095238095238093, + "metrology_engineer": 0.10344827586206896, + "high_school_mathematics": 0.08695652173913043, + "high_school_physics": 0.041666666666666664, + "high_school_chemistry": 0.20833333333333334, + "high_school_biology": 0.2916666666666667, + "middle_school_mathematics": 0.125, + "middle_school_biology": 0.2692307692307692, + "middle_school_physics": 0.2916666666666667, + "middle_school_chemistry": 0.28, + "veterinary_medicine": 0.17857142857142858, + "college_economics": 0.21666666666666667, + "business_administration": 0.07894736842105263, + "marxism": 0.2916666666666667, + "mao_zedong_thought": 0.2413793103448276, + "education_science": 0.14705882352941177, + "teacher_qualification": 0.20408163265306123, + "high_school_politics": 0.16666666666666666, + "high_school_geography": 0.2916666666666667, + "middle_school_politics": 0.34615384615384615, + "middle_school_geography": 0.17647058823529413, + "modern_chinese_history": 0.14285714285714285, + "ideological_and_moral_cultivation": 0.3333333333333333, + "logic": 0.2962962962962963, + "law": 0.13793103448275862, + "chinese_language_and_literature": 0.14285714285714285, + "art_studies": 0.23684210526315788, + "professional_tour_guide": 0.29411764705882354, + "legal_professional": 0.17857142857142858, + "high_school_chinese": 0.20833333333333334, + "high_school_history": 0.32, + "middle_school_history": 0.25925925925925924, + "civil_servant": 0.11538461538461539, + "sports_science": 0.20833333333333334, + "plant_protection": 0.2962962962962963, + "basic_medicine": 0.20833333333333334, + "clinical_medicine": 0.2962962962962963, + "urban_and_rural_planner": 0.17647058823529413, + "accountant": 0.3148148148148148, + "fire_engineer": 0.16666666666666666, + "environmental_impact_assessment_engineer": 0.25, + "tax_accountant": 0.2222222222222222, + "physician": 0.3148148148148148 + } + }, + "prompt_4": { + "accuracy": 0.23848069738480698, + "category_acc": { + "computer_network": 0.4166666666666667, + "operating_system": 0.16666666666666666, + "computer_architecture": 0.15384615384615385, + "college_programming": 0.21428571428571427, + "college_physics": 0.25, + "college_chemistry": 0.27586206896551724, + "advanced_mathematics": 0.375, + "probability_and_statistics": 0.17391304347826086, + "discrete_mathematics": 0.42857142857142855, + "electrical_engineer": 0.21428571428571427, + "metrology_engineer": 0.3448275862068966, + "high_school_mathematics": 0.34782608695652173, + "high_school_physics": 0.125, + "high_school_chemistry": 0.25, + "high_school_biology": 0.20833333333333334, + "middle_school_mathematics": 0.25, + "middle_school_biology": 0.15384615384615385, + "middle_school_physics": 0.16666666666666666, + "middle_school_chemistry": 0.2, + "veterinary_medicine": 0.2857142857142857, + "college_economics": 0.31666666666666665, + "business_administration": 0.21052631578947367, + "marxism": 0.25, + "mao_zedong_thought": 0.1724137931034483, + "education_science": 0.2647058823529412, + "teacher_qualification": 0.20408163265306123, + "high_school_politics": 0.041666666666666664, + "high_school_geography": 0.25, + "middle_school_politics": 0.19230769230769232, + "middle_school_geography": 0.23529411764705882, + "modern_chinese_history": 0.2857142857142857, + "ideological_and_moral_cultivation": 0.08333333333333333, + "logic": 0.25925925925925924, + "law": 0.1724137931034483, + "chinese_language_and_literature": 0.2857142857142857, + "art_studies": 0.3157894736842105, + "professional_tour_guide": 0.23529411764705882, + "legal_professional": 0.25, + "high_school_chinese": 0.125, + "high_school_history": 0.28, + "middle_school_history": 0.25925925925925924, + "civil_servant": 0.21153846153846154, + "sports_science": 0.2916666666666667, + "plant_protection": 0.25925925925925924, + "basic_medicine": 0.375, + "clinical_medicine": 0.2222222222222222, + "urban_and_rural_planner": 0.29411764705882354, + "accountant": 0.24074074074074073, + "fire_engineer": 0.1111111111111111, + "environmental_impact_assessment_engineer": 0.3055555555555556, + "tax_accountant": 0.14814814814814814, + "physician": 0.25925925925925924 + } + }, + "prompt_5": -1 + }, + "cmmlu": { + "prompt_1": { + "accuracy": 0.3154121863799283 + }, + "prompt_2": { + "accuracy": 0.3010752688172043 + }, + "prompt_3": { + "accuracy": 0.2867383512544803 + }, + "prompt_4": { + "accuracy": 0.21863799283154123 + }, + "prompt_5": -1 + }, + "cmmlu_full": { + "prompt_1": { + "accuracy": 0.2628216197547919, + "category_acc": { + "agronomy": 0.27218934911242604, + "anatomy": 0.2972972972972973, + "ancient_chinese": 0.2804878048780488, + "arts": 0.23125, + "astronomy": 0.23636363636363636, + "business_ethics": 0.2679425837320574, + "chinese_civil_service_exam": 0.25625, + "chinese_driving_rule": 0.26717557251908397, + "chinese_food_culture": 0.2647058823529412, + "chinese_foreign_policy": 0.27102803738317754, + "chinese_history": 0.23219814241486067, + "chinese_literature": 0.24019607843137256, + "chinese_teacher_qualification": 0.3128491620111732, + "clinical_knowledge": 0.2869198312236287, + "college_actuarial_science": 0.3018867924528302, + "college_education": 0.27102803738317754, + "college_engineering_hydrology": 0.3113207547169811, + "college_law": 0.2037037037037037, + "college_mathematics": 0.26666666666666666, + "college_medical_statistics": 0.27358490566037735, + "college_medicine": 0.27472527472527475, + "computer_science": 0.28921568627450983, + "computer_security": 0.2573099415204678, + "conceptual_physics": 0.3129251700680272, + "construction_project_management": 0.2517985611510791, + "economics": 0.3018867924528302, + "education": 0.2822085889570552, + "electrical_engineering": 0.3081395348837209, + "elementary_chinese": 0.26587301587301587, + "elementary_commonsense": 0.23232323232323232, + "elementary_information_and_technology": 0.23529411764705882, + "elementary_mathematics": 0.2608695652173913, + "ethnology": 0.26666666666666666, + "food_science": 0.36363636363636365, + "genetics": 0.23295454545454544, + "global_facts": 0.28187919463087246, + "high_school_biology": 0.22485207100591717, + "high_school_chemistry": 0.23484848484848486, + "high_school_geography": 0.211864406779661, + "high_school_mathematics": 0.25609756097560976, + "high_school_physics": 0.2545454545454545, + "high_school_politics": 0.26573426573426573, + "human_sexuality": 0.24603174603174602, + "international_law": 0.2648648648648649, + "journalism": 0.2616279069767442, + "jurisprudence": 0.24817518248175183, + "legal_and_moral_basis": 0.2897196261682243, + "logical": 0.2032520325203252, + "machine_learning": 0.1885245901639344, + "management": 0.2523809523809524, + "marketing": 0.26666666666666666, + "marxist_theory": 0.2698412698412698, + "modern_chinese": 0.27586206896551724, + "nutrition": 0.2827586206896552, + "philosophy": 0.2857142857142857, + "professional_accounting": 0.2857142857142857, + "professional_law": 0.1943127962085308, + "professional_medicine": 0.2579787234042553, + "professional_psychology": 0.23706896551724138, + "public_relations": 0.28160919540229884, + "security_study": 0.2962962962962963, + "sociology": 0.2831858407079646, + "sports_science": 0.2909090909090909, + "traditional_chinese_medicine": 0.22702702702702704, + "virology": 0.3136094674556213, + "world_history": 0.2484472049689441, + "world_religions": 0.21875 + } + }, + "prompt_2": { + "accuracy": 0.2629079606285616, + "category_acc": { + "agronomy": 0.2603550295857988, + "anatomy": 0.2702702702702703, + "ancient_chinese": 0.3048780487804878, + "arts": 0.25, + "astronomy": 0.21818181818181817, + "business_ethics": 0.2583732057416268, + "chinese_civil_service_exam": 0.21875, + "chinese_driving_rule": 0.2748091603053435, + "chinese_food_culture": 0.2426470588235294, + "chinese_foreign_policy": 0.308411214953271, + "chinese_history": 0.2476780185758514, + "chinese_literature": 0.2647058823529412, + "chinese_teacher_qualification": 0.3128491620111732, + "clinical_knowledge": 0.2742616033755274, + "college_actuarial_science": 0.25471698113207547, + "college_education": 0.2803738317757009, + "college_engineering_hydrology": 0.24528301886792453, + "college_law": 0.16666666666666666, + "college_mathematics": 0.26666666666666666, + "college_medical_statistics": 0.22641509433962265, + "college_medicine": 0.28205128205128205, + "computer_science": 0.28921568627450983, + "computer_security": 0.29239766081871343, + "conceptual_physics": 0.35374149659863946, + "construction_project_management": 0.2805755395683453, + "economics": 0.2641509433962264, + "education": 0.25766871165644173, + "electrical_engineering": 0.3023255813953488, + "elementary_chinese": 0.23809523809523808, + "elementary_commonsense": 0.23232323232323232, + "elementary_information_and_technology": 0.21008403361344538, + "elementary_mathematics": 0.2217391304347826, + "ethnology": 0.2518518518518518, + "food_science": 0.3706293706293706, + "genetics": 0.23863636363636365, + "global_facts": 0.26174496644295303, + "high_school_biology": 0.21301775147928995, + "high_school_chemistry": 0.25757575757575757, + "high_school_geography": 0.2796610169491525, + "high_school_mathematics": 0.2621951219512195, + "high_school_physics": 0.2636363636363636, + "high_school_politics": 0.27972027972027974, + "human_sexuality": 0.25396825396825395, + "international_law": 0.2594594594594595, + "journalism": 0.27325581395348836, + "jurisprudence": 0.23357664233576642, + "legal_and_moral_basis": 0.2897196261682243, + "logical": 0.2601626016260163, + "machine_learning": 0.19672131147540983, + "management": 0.23809523809523808, + "marketing": 0.25555555555555554, + "marxist_theory": 0.2962962962962963, + "modern_chinese": 0.27586206896551724, + "nutrition": 0.2620689655172414, + "philosophy": 0.2857142857142857, + "professional_accounting": 0.30857142857142855, + "professional_law": 0.1943127962085308, + "professional_medicine": 0.2579787234042553, + "professional_psychology": 0.25862068965517243, + "public_relations": 0.28735632183908044, + "security_study": 0.31851851851851853, + "sociology": 0.3008849557522124, + "sports_science": 0.2545454545454545, + "traditional_chinese_medicine": 0.23783783783783785, + "virology": 0.3254437869822485, + "world_history": 0.2422360248447205, + "world_religions": 0.29375 + } + }, + "prompt_3": { + "accuracy": 0.2515973061647384, + "category_acc": { + "agronomy": 0.25443786982248523, + "anatomy": 0.24324324324324326, + "ancient_chinese": 0.13414634146341464, + "arts": 0.18125, + "astronomy": 0.24242424242424243, + "business_ethics": 0.2535885167464115, + "chinese_civil_service_exam": 0.275, + "chinese_driving_rule": 0.24427480916030533, + "chinese_food_culture": 0.27941176470588236, + "chinese_foreign_policy": 0.2523364485981308, + "chinese_history": 0.2476780185758514, + "chinese_literature": 0.2647058823529412, + "chinese_teacher_qualification": 0.3128491620111732, + "clinical_knowledge": 0.23628691983122363, + "college_actuarial_science": 0.22641509433962265, + "college_education": 0.2803738317757009, + "college_engineering_hydrology": 0.2169811320754717, + "college_law": 0.2222222222222222, + "college_mathematics": 0.21904761904761905, + "college_medical_statistics": 0.24528301886792453, + "college_medicine": 0.2564102564102564, + "computer_science": 0.22058823529411764, + "computer_security": 0.21052631578947367, + "conceptual_physics": 0.23809523809523808, + "construction_project_management": 0.20863309352517986, + "economics": 0.27044025157232704, + "education": 0.2822085889570552, + "electrical_engineering": 0.21511627906976744, + "elementary_chinese": 0.23412698412698413, + "elementary_commonsense": 0.24242424242424243, + "elementary_information_and_technology": 0.2773109243697479, + "elementary_mathematics": 0.2782608695652174, + "ethnology": 0.24444444444444444, + "food_science": 0.3006993006993007, + "genetics": 0.2727272727272727, + "global_facts": 0.2483221476510067, + "high_school_biology": 0.22485207100591717, + "high_school_chemistry": 0.25, + "high_school_geography": 0.288135593220339, + "high_school_mathematics": 0.2804878048780488, + "high_school_physics": 0.2636363636363636, + "high_school_politics": 0.2517482517482518, + "human_sexuality": 0.2619047619047619, + "international_law": 0.25405405405405407, + "journalism": 0.2558139534883721, + "jurisprudence": 0.22871046228710462, + "legal_and_moral_basis": 0.26635514018691586, + "logical": 0.1951219512195122, + "machine_learning": 0.23770491803278687, + "management": 0.24285714285714285, + "marketing": 0.2722222222222222, + "marxist_theory": 0.2857142857142857, + "modern_chinese": 0.2672413793103448, + "nutrition": 0.27586206896551724, + "philosophy": 0.29523809523809524, + "professional_accounting": 0.3314285714285714, + "professional_law": 0.20853080568720378, + "professional_medicine": 0.27393617021276595, + "professional_psychology": 0.21551724137931033, + "public_relations": 0.25287356321839083, + "security_study": 0.2518518518518518, + "sociology": 0.2610619469026549, + "sports_science": 0.2545454545454545, + "traditional_chinese_medicine": 0.24324324324324326, + "virology": 0.28994082840236685, + "world_history": 0.2670807453416149, + "world_religions": 0.275 + } + }, "prompt_4": -1, "prompt_5": -1 }, + "zbench": { + "prompt_1": { + "accuracy": 0.18181818181818182 + }, + "prompt_2": { + "accuracy": 0.12121212121212122 + }, + "prompt_3": { + "accuracy": 0.24242424242424243 + }, + "prompt_4": { + "accuracy": 0.21212121212121213 + }, + "prompt_5": -1 + }, + "ind_emotion": { + "prompt_1": { + "accuracy": 0.12954545454545455 + }, + "prompt_2": { + "accuracy": 0.125 + }, + "prompt_3": { + "accuracy": 0.1159090909090909 + }, + "prompt_4": { + "accuracy": 0.16590909090909092 + }, + "prompt_5": -1 + }, + "ocnli": { + "prompt_1": { + "accuracy": 0.3142372881355932 + }, + "prompt_2": { + "accuracy": 0.3030508474576271 + }, + "prompt_3": { + "accuracy": 0.3403389830508475 + }, + "prompt_4": { + "accuracy": 0.31559322033898307 + }, + "prompt_5": -1 + }, + "c3": { + "prompt_1": { + "accuracy": 0.27860882572924456 + }, + "prompt_2": { + "accuracy": 0.2857142857142857 + }, + "prompt_3": { + "accuracy": 0.28534031413612565 + }, + "prompt_4": { + "accuracy": 0.2924457741211668 + }, + "prompt_5": -1 + }, + "dream": { + "prompt_1": { + "accuracy": 0.3248407643312102 + }, + "prompt_2": { + "accuracy": 0.3214110730034297 + }, + "prompt_3": { + "accuracy": 0.3248407643312102 + }, + "prompt_4": { + "accuracy": 0.32288094071533563 + }, + "prompt_5": -1 + }, + "samsum": { + "prompt_1": { + "rouge1": 0.10927578644609584, + "rouge2": 0.03267428844189666, + "rougeL": 0.08261344758471074, + "avg_rouge": 0.07485450749090107 + }, + "prompt_2": { + "rouge1": 0.19821357380394702, + "rouge2": 0.05949694497264629, + "rougeL": 0.14784694335746582, + "avg_rouge": 0.13518582071135307 + }, + "prompt_3": { + "rouge1": 0.09414896883478109, + "rouge2": 0.027370283089220106, + "rougeL": 0.0702945967469367, + "avg_rouge": 0.0639379495569793 + }, + "prompt_4": { + "rouge1": 0.014248922879908522, + "rouge2": 0.004032354888082322, + "rougeL": 0.010532953412338281, + "avg_rouge": 0.009604743726776374 + }, + "prompt_5": -1 + }, + "dialogsum": { + "prompt_1": { + "rouge1": 0.2104900397138977, + "rouge2": 0.05827456634560005, + "rougeL": 0.15515524029138283, + "avg_rouge": 0.14130661545029352 + }, + "prompt_2": { + "rouge1": 0.17074081333421356, + "rouge2": 0.04444955559742033, + "rougeL": 0.12665748647069391, + "avg_rouge": 0.11394928513410928 + }, + "prompt_3": { + "rouge1": 0.18877164525277712, + "rouge2": 0.04946218720642109, + "rougeL": 0.1366722666675921, + "avg_rouge": 0.1249686997089301 + }, + "prompt_4": { + "rouge1": 0.18614619283187378, + "rouge2": 0.05124393688371859, + "rougeL": 0.13855193344689934, + "avg_rouge": 0.1253140210541639 + }, + "prompt_5": -1 + }, + "sst2": { + "prompt_1": { + "accuracy": 0.45871559633027525 + }, + "prompt_2": { + "accuracy": 0.49311926605504586 + }, + "prompt_3": { + "accuracy": 0.4873853211009174 + }, + "prompt_4": { + "accuracy": 0.49770642201834864 + }, + "prompt_5": -1 + }, + "cola": { + "prompt_1": { + "accuracy": 0.46596356663470756 + }, + "prompt_2": { + "accuracy": 0.49089165867689355 + }, + "prompt_3": { + "accuracy": 0.4899328859060403 + }, + "prompt_4": { + "accuracy": 0.5215723873441994 + }, + "prompt_5": -1 + }, + "qqp": { + "prompt_1": { + "accuracy": 0.538 + }, + "prompt_2": { + "accuracy": 0.505 + }, + "prompt_3": { + "accuracy": 0.598 + }, + "prompt_4": { + "accuracy": 0.489 + }, + "prompt_5": -1 + }, + "mnli": { + "prompt_1": { + "accuracy": 0.316 + }, + "prompt_2": { + "accuracy": 0.346 + }, + "prompt_3": { + "accuracy": 0.344 + }, + "prompt_4": { + "accuracy": 0.33 + }, + "prompt_5": -1 + }, + "qnli": { + "prompt_1": { + "accuracy": 0.5005 + }, + "prompt_2": { + "accuracy": 0.488 + }, + "prompt_3": { + "accuracy": 0.486 + }, + "prompt_4": { + "accuracy": 0.494 + }, + "prompt_5": -1 + }, + "wnli": { + "prompt_1": { + "accuracy": 0.5070422535211268 + }, + "prompt_2": { + "accuracy": 0.5492957746478874 + }, + "prompt_3": { + "accuracy": 0.5633802816901409 + }, + "prompt_4": { + "accuracy": 0.5352112676056338 + }, + "prompt_5": -1 + }, + "rte": { + "prompt_1": { + "accuracy": 0.49097472924187724 + }, + "prompt_2": { + "accuracy": 0.48014440433212996 + }, + "prompt_3": { + "accuracy": 0.47653429602888087 + }, + "prompt_4": { + "accuracy": 0.5018050541516246 + }, + "prompt_5": -1 + }, + "mrpc": { + "prompt_1": { + "accuracy": 0.49264705882352944 + }, + "prompt_2": { + "accuracy": 0.3799019607843137 + }, + "prompt_3": { + "accuracy": 0.3431372549019608 + }, + "prompt_4": { + "accuracy": 0.4877450980392157 + }, + "prompt_5": -1 + } + }, + "five_shot": { + "cross_mmlu": { + "prompt_1": { + "overall_acc": 0.2533333333333333, + "language_acc": { + "English": 0.24666666666666667, + "Vietnamese": 0.25333333333333335, + "Malay": 0.28, + "Indonesian": 0.26, + "Spanish": 0.24, + "Chinese": 0.22666666666666666, + "Filipino": 0.26666666666666666 + }, + "consistency_score_2": 0.6292063492063492, + "consistency_score_3": 0.4807619047619048, + "consistency_score_4": 0.3803809523809523, + "consistency_score_5": 0.30571428571428577, + "consistency_score_6": 0.2495238095238095, + "consistency_score_7": 0.20666666666666667, + "detailed_consistency_score": { + "2_combine": { + "English,Vietnamese": 0.5933333333333334, + "English,Malay": 0.5266666666666666, + "English,Indonesian": 0.6333333333333333, + "English,Spanish": 0.6133333333333333, + "English,Chinese": 0.64, + "English,Filipino": 0.6533333333333333, + "Vietnamese,Malay": 0.5733333333333334, + "Vietnamese,Indonesian": 0.66, + "Vietnamese,Spanish": 0.6733333333333333, + "Vietnamese,Chinese": 0.64, + "Vietnamese,Filipino": 0.66, + "Malay,Indonesian": 0.5866666666666667, + "Malay,Spanish": 0.62, + "Malay,Chinese": 0.5466666666666666, + "Malay,Filipino": 0.6066666666666667, + "Indonesian,Spanish": 0.62, + "Indonesian,Chinese": 0.68, + "Indonesian,Filipino": 0.6733333333333333, + "Spanish,Chinese": 0.6466666666666666, + "Spanish,Filipino": 0.6733333333333333, + "Chinese,Filipino": 0.6933333333333334 + }, + "3_combine": { + "English,Vietnamese,Malay": 0.3933333333333333, + "English,Vietnamese,Indonesian": 0.48, + "English,Vietnamese,Spanish": 0.48, + "English,Vietnamese,Chinese": 0.49333333333333335, + "English,Vietnamese,Filipino": 0.4866666666666667, + "English,Malay,Indonesian": 0.42, + "English,Malay,Spanish": 0.44, + "English,Malay,Chinese": 0.4066666666666667, + "English,Malay,Filipino": 0.44666666666666666, + "English,Indonesian,Spanish": 0.4666666666666667, + "English,Indonesian,Chinese": 0.5133333333333333, + "English,Indonesian,Filipino": 0.5066666666666667, + "English,Spanish,Chinese": 0.47333333333333333, + "English,Spanish,Filipino": 0.5066666666666667, + "English,Chinese,Filipino": 0.52, + "Vietnamese,Malay,Indonesian": 0.44666666666666666, + "Vietnamese,Malay,Spanish": 0.48, + "Vietnamese,Malay,Chinese": 0.41333333333333333, + "Vietnamese,Malay,Filipino": 0.44666666666666666, + "Vietnamese,Indonesian,Spanish": 0.5066666666666667, + "Vietnamese,Indonesian,Chinese": 0.52, + "Vietnamese,Indonesian,Filipino": 0.52, + "Vietnamese,Spanish,Chinese": 0.5133333333333333, + "Vietnamese,Spanish,Filipino": 0.5466666666666666, + "Vietnamese,Chinese,Filipino": 0.5266666666666666, + "Malay,Indonesian,Spanish": 0.4666666666666667, + "Malay,Indonesian,Chinese": 0.44, + "Malay,Indonesian,Filipino": 0.47333333333333333, + "Malay,Spanish,Chinese": 0.44666666666666666, + "Malay,Spanish,Filipino": 0.49333333333333335, + "Malay,Chinese,Filipino": 0.4666666666666667, + "Indonesian,Spanish,Chinese": 0.4866666666666667, + "Indonesian,Spanish,Filipino": 0.5133333333333333, + "Indonesian,Chinese,Filipino": 0.5466666666666666, + "Spanish,Chinese,Filipino": 0.54 + }, + "4_combine": { + "English,Vietnamese,Malay,Indonesian": 0.32666666666666666, + "English,Vietnamese,Malay,Spanish": 0.3466666666666667, + "English,Vietnamese,Malay,Chinese": 0.31333333333333335, + "English,Vietnamese,Malay,Filipino": 0.3466666666666667, + "English,Vietnamese,Indonesian,Spanish": 0.38, + "English,Vietnamese,Indonesian,Chinese": 0.4066666666666667, + "English,Vietnamese,Indonesian,Filipino": 0.4, + "English,Vietnamese,Spanish,Chinese": 0.3933333333333333, + "English,Vietnamese,Spanish,Filipino": 0.41333333333333333, + "English,Vietnamese,Chinese,Filipino": 0.4066666666666667, + "English,Malay,Indonesian,Spanish": 0.36, + "English,Malay,Indonesian,Chinese": 0.3333333333333333, + "English,Malay,Indonesian,Filipino": 0.36666666666666664, + "English,Malay,Spanish,Chinese": 0.34, + "English,Malay,Spanish,Filipino": 0.38666666666666666, + "English,Malay,Chinese,Filipino": 0.36666666666666664, + "English,Indonesian,Spanish,Chinese": 0.37333333333333335, + "English,Indonesian,Spanish,Filipino": 0.3933333333333333, + "English,Indonesian,Chinese,Filipino": 0.42, + "English,Spanish,Chinese,Filipino": 0.4066666666666667, + "Vietnamese,Malay,Indonesian,Spanish": 0.38, + "Vietnamese,Malay,Indonesian,Chinese": 0.35333333333333333, + "Vietnamese,Malay,Indonesian,Filipino": 0.37333333333333335, + "Vietnamese,Malay,Spanish,Chinese": 0.35333333333333333, + "Vietnamese,Malay,Spanish,Filipino": 0.4, + "Vietnamese,Malay,Chinese,Filipino": 0.36, + "Vietnamese,Indonesian,Spanish,Chinese": 0.4, + "Vietnamese,Indonesian,Spanish,Filipino": 0.42, + "Vietnamese,Indonesian,Chinese,Filipino": 0.4266666666666667, + "Vietnamese,Spanish,Chinese,Filipino": 0.44, + "Malay,Indonesian,Spanish,Chinese": 0.35333333333333333, + "Malay,Indonesian,Spanish,Filipino": 0.3933333333333333, + "Malay,Indonesian,Chinese,Filipino": 0.38, + "Malay,Spanish,Chinese,Filipino": 0.38666666666666666, + "Indonesian,Spanish,Chinese,Filipino": 0.41333333333333333 + }, + "5_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish": 0.2866666666666667, + "English,Vietnamese,Malay,Indonesian,Chinese": 0.26666666666666666, + "English,Vietnamese,Malay,Indonesian,Filipino": 0.29333333333333333, + "English,Vietnamese,Malay,Spanish,Chinese": 0.2733333333333333, + "English,Vietnamese,Malay,Spanish,Filipino": 0.31333333333333335, + "English,Vietnamese,Malay,Chinese,Filipino": 0.2866666666666667, + "English,Vietnamese,Indonesian,Spanish,Chinese": 0.31333333333333335, + "English,Vietnamese,Indonesian,Spanish,Filipino": 0.32666666666666666, + "English,Vietnamese,Indonesian,Chinese,Filipino": 0.3333333333333333, + "English,Vietnamese,Spanish,Chinese,Filipino": 0.34, + "English,Malay,Indonesian,Spanish,Chinese": 0.2733333333333333, + "English,Malay,Indonesian,Spanish,Filipino": 0.31333333333333335, + "English,Malay,Indonesian,Chinese,Filipino": 0.29333333333333333, + "English,Malay,Spanish,Chinese,Filipino": 0.30666666666666664, + "English,Indonesian,Spanish,Chinese,Filipino": 0.31333333333333335, + "Vietnamese,Malay,Indonesian,Spanish,Chinese": 0.29333333333333333, + "Vietnamese,Malay,Indonesian,Spanish,Filipino": 0.32666666666666666, + "Vietnamese,Malay,Indonesian,Chinese,Filipino": 0.30666666666666664, + "Vietnamese,Malay,Spanish,Chinese,Filipino": 0.31333333333333335, + "Vietnamese,Indonesian,Spanish,Chinese,Filipino": 0.34, + "Malay,Indonesian,Spanish,Chinese,Filipino": 0.30666666666666664 + }, + "6_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish,Chinese": 0.22666666666666666, + "English,Vietnamese,Malay,Indonesian,Spanish,Filipino": 0.26, + "English,Vietnamese,Malay,Indonesian,Chinese,Filipino": 0.24, + "English,Vietnamese,Malay,Spanish,Chinese,Filipino": 0.25333333333333335, + "English,Vietnamese,Indonesian,Spanish,Chinese,Filipino": 0.26666666666666666, + "English,Malay,Indonesian,Spanish,Chinese,Filipino": 0.24, + "Vietnamese,Malay,Indonesian,Spanish,Chinese,Filipino": 0.26 + }, + "7_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish,Chinese,Filipino": 0.20666666666666667 + } + }, + "AC3_2": 0.36122781770487283, + "AC3_3": 0.33181802451802334, + "AC3_4": 0.3041218314318212, + "AC3_5": 0.27706984662846285, + "AC3_6": 0.25141414136414425, + "AC3_7": 0.22763285019206048 + } + }, + "cross_logiqa": { + "prompt_1": -1 + }, + "sg_eval": { + "prompt_1": { + "accuracy": 0.14563106796116504 + } + }, + "cn_eval": { + "prompt_1": { + "accuracy": 0.22857142857142856 + } + }, + "us_eval": { + "prompt_1": { + "accuracy": 0.21495327102803738 + } + }, "ph_eval": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": { + "accuracy": 0.3, + "category_acc": { + "brand": 0.3, + "demographics": 0.4, + "biology": 0.3, + "history": 0.26666666666666666, + "literature": 0.3, + "politics": 0.5, + "culture": 0.2, + "film": 0.4, + "law": 0.1, + "geography": 0.3 + } + } }, "sing2eng": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": { + "bleu_score": 0.01012215205693314 + } }, "flores_ind2eng": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": { + "bleu_score": 0.019360186688278762 + } }, "flores_vie2eng": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": { + "bleu_score": 0.02686115849874378 + } }, "flores_zho2eng": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": { + "bleu_score": 0.012846813544731085 + } }, "flores_zsm2eng": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": { + "bleu_score": 0.021966847435259312 + } }, "mmlu": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": { + "accuracy": 0.2660443407234539 + } }, "mmlu_full": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": -1 }, "c_eval": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": { + "accuracy": 0.24442793462109955 + } }, "c_eval_full": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": { + "accuracy": 0.22851805728518057, + "category_acc": { + "computer_network": 0.20833333333333334, + "operating_system": 0.20833333333333334, + "computer_architecture": 0.34615384615384615, + "college_programming": 0.23809523809523808, + "college_physics": 0.20833333333333334, + "college_chemistry": 0.06896551724137931, + "advanced_mathematics": 0.20833333333333334, + "probability_and_statistics": 0.17391304347826086, + "discrete_mathematics": 0.42857142857142855, + "electrical_engineer": 0.2857142857142857, + "metrology_engineer": 0.10344827586206896, + "high_school_mathematics": 0.34782608695652173, + "high_school_physics": 0.25, + "high_school_chemistry": 0.16666666666666666, + "high_school_biology": 0.2916666666666667, + "middle_school_mathematics": 0.16666666666666666, + "middle_school_biology": 0.19230769230769232, + "middle_school_physics": 0.16666666666666666, + "middle_school_chemistry": 0.08, + "veterinary_medicine": 0.21428571428571427, + "college_economics": 0.25, + "business_administration": 0.23684210526315788, + "marxism": 0.25, + "mao_zedong_thought": 0.2413793103448276, + "education_science": 0.17647058823529413, + "teacher_qualification": 0.2653061224489796, + "high_school_politics": 0.25, + "high_school_geography": 0.16666666666666666, + "middle_school_politics": 0.3076923076923077, + "middle_school_geography": 0.11764705882352941, + "modern_chinese_history": 0.21428571428571427, + "ideological_and_moral_cultivation": 0.2916666666666667, + "logic": 0.14814814814814814, + "law": 0.20689655172413793, + "chinese_language_and_literature": 0.17857142857142858, + "art_studies": 0.42105263157894735, + "professional_tour_guide": 0.2647058823529412, + "legal_professional": 0.14285714285714285, + "high_school_chinese": 0.20833333333333334, + "high_school_history": 0.28, + "middle_school_history": 0.18518518518518517, + "civil_servant": 0.23076923076923078, + "sports_science": 0.25, + "plant_protection": 0.3333333333333333, + "basic_medicine": 0.08333333333333333, + "clinical_medicine": 0.18518518518518517, + "urban_and_rural_planner": 0.2549019607843137, + "accountant": 0.24074074074074073, + "fire_engineer": 0.2777777777777778, + "environmental_impact_assessment_engineer": 0.19444444444444445, + "tax_accountant": 0.18518518518518517, + "physician": 0.2777777777777778 + } + } }, "cmmlu": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": { + "accuracy": 0.2867383512544803 + } }, "cmmlu_full": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": -1 }, "zbench": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": { + "accuracy": 0.12121212121212122 + } }, "ind_emotion": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": { + "accuracy": 0.15 + } }, "ocnli": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": { + "accuracy": 0.3311864406779661 + } }, "c3": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": -1 }, "dream": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": -1 }, "samsum": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": -1 }, "dialogsum": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": -1 }, "sst2": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": { + "accuracy": 0.5057339449541285 + } }, "cola": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": { + "accuracy": 0.3969319271332694 + } }, "qqp": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": { + "accuracy": 0.545 + } }, "mnli": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": { + "accuracy": 0.3405 + } }, "qnli": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": { + "accuracy": 0.4935 + } }, "wnli": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": { + "accuracy": 0.5352112676056338 + } }, "rte": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": { + "accuracy": 0.49458483754512633 + } + }, + "mrpc": { + "prompt_1": { + "accuracy": 0.4166666666666667 + } + } + } + }, + "phi-2": { + "model_size": "2.7B", + "model_link": "https://huggingface.co/microsoft/phi-2", + "zero_shot": { + "cross_mmlu": { + "prompt_1": { + "overall_acc": 0.3009523809523809, + "language_acc": { + "English": 0.36, + "Vietnamese": 0.29333333333333333, + "Malay": 0.25333333333333335, + "Indonesian": 0.26, + "Spanish": 0.31333333333333335, + "Chinese": 0.32, + "Filipino": 0.30666666666666664 + }, + "consistency_score_2": 0.5882539682539683, + "consistency_score_3": 0.42228571428571426, + "consistency_score_4": 0.31942857142857145, + "consistency_score_5": 0.24698412698412694, + "consistency_score_6": 0.19333333333333336, + "consistency_score_7": 0.15333333333333332, + "detailed_consistency_score": { + "2_combine": { + "English,Vietnamese": 0.5266666666666666, + "English,Malay": 0.6733333333333333, + "English,Indonesian": 0.6666666666666666, + "English,Spanish": 0.6, + "English,Chinese": 0.43333333333333335, + "English,Filipino": 0.6, + "Vietnamese,Malay": 0.58, + "Vietnamese,Indonesian": 0.6066666666666667, + "Vietnamese,Spanish": 0.54, + "Vietnamese,Chinese": 0.44666666666666666, + "Vietnamese,Filipino": 0.5733333333333334, + "Malay,Indonesian": 0.8733333333333333, + "Malay,Spanish": 0.7133333333333334, + "Malay,Chinese": 0.41333333333333333, + "Malay,Filipino": 0.7333333333333333, + "Indonesian,Spanish": 0.7333333333333333, + "Indonesian,Chinese": 0.44666666666666666, + "Indonesian,Filipino": 0.7266666666666667, + "Spanish,Chinese": 0.46, + "Spanish,Filipino": 0.6066666666666667, + "Chinese,Filipino": 0.4 + }, + "3_combine": { + "English,Vietnamese,Malay": 0.43333333333333335, + "English,Vietnamese,Indonesian": 0.44, + "English,Vietnamese,Spanish": 0.38, + "English,Vietnamese,Chinese": 0.26666666666666666, + "English,Vietnamese,Filipino": 0.4, + "English,Malay,Indonesian": 0.62, + "English,Malay,Spanish": 0.5266666666666666, + "English,Malay,Chinese": 0.30666666666666664, + "English,Malay,Filipino": 0.5333333333333333, + "English,Indonesian,Spanish": 0.5333333333333333, + "English,Indonesian,Chinese": 0.32666666666666666, + "English,Indonesian,Filipino": 0.52, + "English,Spanish,Chinese": 0.32, + "English,Spanish,Filipino": 0.44666666666666666, + "English,Chinese,Filipino": 0.28, + "Vietnamese,Malay,Indonesian": 0.54, + "Vietnamese,Malay,Spanish": 0.44666666666666666, + "Vietnamese,Malay,Chinese": 0.2733333333333333, + "Vietnamese,Malay,Filipino": 0.46, + "Vietnamese,Indonesian,Spanish": 0.47333333333333333, + "Vietnamese,Indonesian,Chinese": 0.30666666666666664, + "Vietnamese,Indonesian,Filipino": 0.47333333333333333, + "Vietnamese,Spanish,Chinese": 0.29333333333333333, + "Vietnamese,Spanish,Filipino": 0.4, + "Vietnamese,Chinese,Filipino": 0.2733333333333333, + "Malay,Indonesian,Spanish": 0.6733333333333333, + "Malay,Indonesian,Chinese": 0.38666666666666666, + "Malay,Indonesian,Filipino": 0.6666666666666666, + "Malay,Spanish,Chinese": 0.3466666666666667, + "Malay,Spanish,Filipino": 0.5466666666666666, + "Malay,Chinese,Filipino": 0.32, + "Indonesian,Spanish,Chinese": 0.36666666666666664, + "Indonesian,Spanish,Filipino": 0.56, + "Indonesian,Chinese,Filipino": 0.34, + "Spanish,Chinese,Filipino": 0.3 + }, + "4_combine": { + "English,Vietnamese,Malay,Indonesian": 0.4066666666666667, + "English,Vietnamese,Malay,Spanish": 0.3466666666666667, + "English,Vietnamese,Malay,Chinese": 0.20666666666666667, + "English,Vietnamese,Malay,Filipino": 0.36, + "English,Vietnamese,Indonesian,Spanish": 0.35333333333333333, + "English,Vietnamese,Indonesian,Chinese": 0.22, + "English,Vietnamese,Indonesian,Filipino": 0.35333333333333333, + "English,Vietnamese,Spanish,Chinese": 0.20666666666666667, + "English,Vietnamese,Spanish,Filipino": 0.29333333333333333, + "English,Vietnamese,Chinese,Filipino": 0.19333333333333333, + "English,Malay,Indonesian,Spanish": 0.5066666666666667, + "English,Malay,Indonesian,Chinese": 0.3, + "English,Malay,Indonesian,Filipino": 0.5, + "English,Malay,Spanish,Chinese": 0.2733333333333333, + "English,Malay,Spanish,Filipino": 0.4266666666666667, + "English,Malay,Chinese,Filipino": 0.25333333333333335, + "English,Indonesian,Spanish,Chinese": 0.28, + "English,Indonesian,Spanish,Filipino": 0.42, + "English,Indonesian,Chinese,Filipino": 0.25333333333333335, + "English,Spanish,Chinese,Filipino": 0.24, + "Vietnamese,Malay,Indonesian,Spanish": 0.4266666666666667, + "Vietnamese,Malay,Indonesian,Chinese": 0.26666666666666666, + "Vietnamese,Malay,Indonesian,Filipino": 0.4266666666666667, + "Vietnamese,Malay,Spanish,Chinese": 0.24666666666666667, + "Vietnamese,Malay,Spanish,Filipino": 0.35333333333333333, + "Vietnamese,Malay,Chinese,Filipino": 0.22666666666666666, + "Vietnamese,Indonesian,Spanish,Chinese": 0.2733333333333333, + "Vietnamese,Indonesian,Spanish,Filipino": 0.38, + "Vietnamese,Indonesian,Chinese,Filipino": 0.24666666666666667, + "Vietnamese,Spanish,Chinese,Filipino": 0.22666666666666666, + "Malay,Indonesian,Spanish,Chinese": 0.3333333333333333, + "Malay,Indonesian,Spanish,Filipino": 0.52, + "Malay,Indonesian,Chinese,Filipino": 0.30666666666666664, + "Malay,Spanish,Chinese,Filipino": 0.26666666666666666, + "Indonesian,Spanish,Chinese,Filipino": 0.2866666666666667 + }, + "5_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish": 0.3333333333333333, + "English,Vietnamese,Malay,Indonesian,Chinese": 0.2, + "English,Vietnamese,Malay,Indonesian,Filipino": 0.3333333333333333, + "English,Vietnamese,Malay,Spanish,Chinese": 0.19333333333333333, + "English,Vietnamese,Malay,Spanish,Filipino": 0.28, + "English,Vietnamese,Malay,Chinese,Filipino": 0.17333333333333334, + "English,Vietnamese,Indonesian,Spanish,Chinese": 0.2, + "English,Vietnamese,Indonesian,Spanish,Filipino": 0.28, + "English,Vietnamese,Indonesian,Chinese,Filipino": 0.17333333333333334, + "English,Vietnamese,Spanish,Chinese,Filipino": 0.16666666666666666, + "English,Malay,Indonesian,Spanish,Chinese": 0.26666666666666666, + "English,Malay,Indonesian,Spanish,Filipino": 0.4066666666666667, + "English,Malay,Indonesian,Chinese,Filipino": 0.24666666666666667, + "English,Malay,Spanish,Chinese,Filipino": 0.22666666666666666, + "English,Indonesian,Spanish,Chinese,Filipino": 0.22666666666666666, + "Vietnamese,Malay,Indonesian,Spanish,Chinese": 0.24, + "Vietnamese,Malay,Indonesian,Spanish,Filipino": 0.34, + "Vietnamese,Malay,Indonesian,Chinese,Filipino": 0.22, + "Vietnamese,Malay,Spanish,Chinese,Filipino": 0.2, + "Vietnamese,Indonesian,Spanish,Chinese,Filipino": 0.22, + "Malay,Indonesian,Spanish,Chinese,Filipino": 0.26 + }, + "6_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish,Chinese": 0.18666666666666668, + "English,Vietnamese,Malay,Indonesian,Spanish,Filipino": 0.26666666666666666, + "English,Vietnamese,Malay,Indonesian,Chinese,Filipino": 0.16666666666666666, + "English,Vietnamese,Malay,Spanish,Chinese,Filipino": 0.16, + "English,Vietnamese,Indonesian,Spanish,Chinese,Filipino": 0.16, + "English,Malay,Indonesian,Spanish,Chinese,Filipino": 0.22, + "Vietnamese,Malay,Indonesian,Spanish,Chinese,Filipino": 0.19333333333333333 + }, + "7_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish,Chinese,Filipino": 0.15333333333333332 + } + }, + "AC3_2": 0.39818976211499246, + "AC3_3": 0.35144136343385574, + "AC3_4": 0.30991534711439367, + "AC3_5": 0.2713104893838018, + "AC3_6": 0.23542710335635225, + "AC3_7": 0.20315863028372116 + }, + "prompt_2": { + "overall_acc": 0.31047619047619046, + "language_acc": { + "English": 0.38666666666666666, + "Vietnamese": 0.2866666666666667, + "Malay": 0.2733333333333333, + "Indonesian": 0.26666666666666666, + "Spanish": 0.31333333333333335, + "Chinese": 0.35333333333333333, + "Filipino": 0.29333333333333333 + }, + "consistency_score_2": 0.5819047619047619, + "consistency_score_3": 0.41942857142857154, + "consistency_score_4": 0.3219047619047619, + "consistency_score_5": 0.2539682539682539, + "consistency_score_6": 0.20380952380952383, + "consistency_score_7": 0.16666666666666666, + "detailed_consistency_score": { + "2_combine": { + "English,Vietnamese": 0.44, + "English,Malay": 0.5933333333333334, + "English,Indonesian": 0.56, + "English,Spanish": 0.56, + "English,Chinese": 0.42, + "English,Filipino": 0.54, + "Vietnamese,Malay": 0.64, + "Vietnamese,Indonesian": 0.62, + "Vietnamese,Spanish": 0.5466666666666666, + "Vietnamese,Chinese": 0.44, + "Vietnamese,Filipino": 0.6, + "Malay,Indonesian": 0.8466666666666667, + "Malay,Spanish": 0.74, + "Malay,Chinese": 0.44666666666666666, + "Malay,Filipino": 0.7266666666666667, + "Indonesian,Spanish": 0.7466666666666667, + "Indonesian,Chinese": 0.49333333333333335, + "Indonesian,Filipino": 0.7133333333333334, + "Spanish,Chinese": 0.48, + "Spanish,Filipino": 0.62, + "Chinese,Filipino": 0.44666666666666666 + }, + "3_combine": { + "English,Vietnamese,Malay": 0.36666666666666664, + "English,Vietnamese,Indonesian": 0.35333333333333333, + "English,Vietnamese,Spanish": 0.3333333333333333, + "English,Vietnamese,Chinese": 0.25333333333333335, + "English,Vietnamese,Filipino": 0.32666666666666666, + "English,Malay,Indonesian": 0.52, + "English,Malay,Spanish": 0.48, + "English,Malay,Chinese": 0.2866666666666667, + "English,Malay,Filipino": 0.46, + "English,Indonesian,Spanish": 0.47333333333333333, + "English,Indonesian,Chinese": 0.3, + "English,Indonesian,Filipino": 0.44, + "English,Spanish,Chinese": 0.3, + "English,Spanish,Filipino": 0.4, + "English,Chinese,Filipino": 0.2733333333333333, + "Vietnamese,Malay,Indonesian": 0.5666666666666667, + "Vietnamese,Malay,Spanish": 0.49333333333333335, + "Vietnamese,Malay,Chinese": 0.3333333333333333, + "Vietnamese,Malay,Filipino": 0.5066666666666667, + "Vietnamese,Indonesian,Spanish": 0.4866666666666667, + "Vietnamese,Indonesian,Chinese": 0.34, + "Vietnamese,Indonesian,Filipino": 0.49333333333333335, + "Vietnamese,Spanish,Chinese": 0.32, + "Vietnamese,Spanish,Filipino": 0.42, + "Vietnamese,Chinese,Filipino": 0.32, + "Malay,Indonesian,Spanish": 0.6933333333333334, + "Malay,Indonesian,Chinese": 0.42, + "Malay,Indonesian,Filipino": 0.66, + "Malay,Spanish,Chinese": 0.4, + "Malay,Spanish,Filipino": 0.5733333333333334, + "Malay,Chinese,Filipino": 0.36, + "Indonesian,Spanish,Chinese": 0.41333333333333333, + "Indonesian,Spanish,Filipino": 0.58, + "Indonesian,Chinese,Filipino": 0.38666666666666666, + "Spanish,Chinese,Filipino": 0.3466666666666667 + }, + "4_combine": { + "English,Vietnamese,Malay,Indonesian": 0.3333333333333333, + "English,Vietnamese,Malay,Spanish": 0.31333333333333335, + "English,Vietnamese,Malay,Chinese": 0.22, + "English,Vietnamese,Malay,Filipino": 0.29333333333333333, + "English,Vietnamese,Indonesian,Spanish": 0.31333333333333335, + "English,Vietnamese,Indonesian,Chinese": 0.21333333333333335, + "English,Vietnamese,Indonesian,Filipino": 0.2866666666666667, + "English,Vietnamese,Spanish,Chinese": 0.20666666666666667, + "English,Vietnamese,Spanish,Filipino": 0.26666666666666666, + "English,Vietnamese,Chinese,Filipino": 0.2, + "English,Malay,Indonesian,Spanish": 0.44666666666666666, + "English,Malay,Indonesian,Chinese": 0.2733333333333333, + "English,Malay,Indonesian,Filipino": 0.4266666666666667, + "English,Malay,Spanish,Chinese": 0.26, + "English,Malay,Spanish,Filipino": 0.37333333333333335, + "English,Malay,Chinese,Filipino": 0.24666666666666667, + "English,Indonesian,Spanish,Chinese": 0.26666666666666666, + "English,Indonesian,Spanish,Filipino": 0.37333333333333335, + "English,Indonesian,Chinese,Filipino": 0.24666666666666667, + "English,Spanish,Chinese,Filipino": 0.23333333333333334, + "Vietnamese,Malay,Indonesian,Spanish": 0.47333333333333333, + "Vietnamese,Malay,Indonesian,Chinese": 0.32, + "Vietnamese,Malay,Indonesian,Filipino": 0.4666666666666667, + "Vietnamese,Malay,Spanish,Chinese": 0.30666666666666664, + "Vietnamese,Malay,Spanish,Filipino": 0.4066666666666667, + "Vietnamese,Malay,Chinese,Filipino": 0.28, + "Vietnamese,Indonesian,Spanish,Chinese": 0.3, + "Vietnamese,Indonesian,Spanish,Filipino": 0.4066666666666667, + "Vietnamese,Indonesian,Chinese,Filipino": 0.2866666666666667, + "Vietnamese,Spanish,Chinese,Filipino": 0.26, + "Malay,Indonesian,Spanish,Chinese": 0.38666666666666666, + "Malay,Indonesian,Spanish,Filipino": 0.56, + "Malay,Indonesian,Chinese,Filipino": 0.35333333333333333, + "Malay,Spanish,Chinese,Filipino": 0.32666666666666666, + "Indonesian,Spanish,Chinese,Filipino": 0.34 + }, + "5_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish": 0.3, + "English,Vietnamese,Malay,Indonesian,Chinese": 0.20666666666666667, + "English,Vietnamese,Malay,Indonesian,Filipino": 0.28, + "English,Vietnamese,Malay,Spanish,Chinese": 0.2, + "English,Vietnamese,Malay,Spanish,Filipino": 0.25333333333333335, + "English,Vietnamese,Malay,Chinese,Filipino": 0.18666666666666668, + "English,Vietnamese,Indonesian,Spanish,Chinese": 0.19333333333333333, + "English,Vietnamese,Indonesian,Spanish,Filipino": 0.25333333333333335, + "English,Vietnamese,Indonesian,Chinese,Filipino": 0.18, + "English,Vietnamese,Spanish,Chinese,Filipino": 0.17333333333333334, + "English,Malay,Indonesian,Spanish,Chinese": 0.25333333333333335, + "English,Malay,Indonesian,Spanish,Filipino": 0.36, + "English,Malay,Indonesian,Chinese,Filipino": 0.24, + "English,Malay,Spanish,Chinese,Filipino": 0.22, + "English,Indonesian,Spanish,Chinese,Filipino": 0.22666666666666666, + "Vietnamese,Malay,Indonesian,Spanish,Chinese": 0.3, + "Vietnamese,Malay,Indonesian,Spanish,Filipino": 0.4, + "Vietnamese,Malay,Indonesian,Chinese,Filipino": 0.2733333333333333, + "Vietnamese,Malay,Spanish,Chinese,Filipino": 0.25333333333333335, + "Vietnamese,Indonesian,Spanish,Chinese,Filipino": 0.25333333333333335, + "Malay,Indonesian,Spanish,Chinese,Filipino": 0.32666666666666666 + }, + "6_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish,Chinese": 0.19333333333333333, + "English,Vietnamese,Malay,Indonesian,Spanish,Filipino": 0.24666666666666667, + "English,Vietnamese,Malay,Indonesian,Chinese,Filipino": 0.18, + "English,Vietnamese,Malay,Spanish,Chinese,Filipino": 0.16666666666666666, + "English,Vietnamese,Indonesian,Spanish,Chinese,Filipino": 0.16666666666666666, + "English,Malay,Indonesian,Spanish,Chinese,Filipino": 0.22, + "Vietnamese,Malay,Indonesian,Spanish,Chinese,Filipino": 0.25333333333333335 + }, + "7_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish,Chinese,Filipino": 0.16666666666666666 + } + }, + "AC3_2": 0.40491131773680794, + "AC3_3": 0.3568207574816837, + "AC3_4": 0.31608720591674033, + "AC3_5": 0.27939364717316967, + "AC3_6": 0.24608112869994633, + "AC3_7": 0.21689953421934574 + }, + "prompt_3": { + "overall_acc": 0.29904761904761906, + "language_acc": { + "English": 0.38666666666666666, + "Vietnamese": 0.26666666666666666, + "Malay": 0.26666666666666666, + "Indonesian": 0.26, + "Spanish": 0.2866666666666667, + "Chinese": 0.32, + "Filipino": 0.30666666666666664 + }, + "consistency_score_2": 0.6123809523809525, + "consistency_score_3": 0.46095238095238095, + "consistency_score_4": 0.36933333333333335, + "consistency_score_5": 0.30571428571428566, + "consistency_score_6": 0.26, + "consistency_score_7": 0.22666666666666666, + "detailed_consistency_score": { + "2_combine": { + "English,Vietnamese": 0.4866666666666667, + "English,Malay": 0.62, + "English,Indonesian": 0.6133333333333333, + "English,Spanish": 0.6066666666666667, + "English,Chinese": 0.44666666666666666, + "English,Filipino": 0.54, + "Vietnamese,Malay": 0.6933333333333334, + "Vietnamese,Indonesian": 0.6666666666666666, + "Vietnamese,Spanish": 0.6266666666666667, + "Vietnamese,Chinese": 0.4866666666666667, + "Vietnamese,Filipino": 0.6, + "Malay,Indonesian": 0.9, + "Malay,Spanish": 0.76, + "Malay,Chinese": 0.4533333333333333, + "Malay,Filipino": 0.7466666666666667, + "Indonesian,Spanish": 0.7933333333333333, + "Indonesian,Chinese": 0.4866666666666667, + "Indonesian,Filipino": 0.7066666666666667, + "Spanish,Chinese": 0.5266666666666666, + "Spanish,Filipino": 0.64, + "Chinese,Filipino": 0.46 + }, + "3_combine": { + "English,Vietnamese,Malay": 0.43333333333333335, + "English,Vietnamese,Indonesian": 0.42, + "English,Vietnamese,Spanish": 0.42, + "English,Vietnamese,Chinese": 0.29333333333333333, + "English,Vietnamese,Filipino": 0.36, + "English,Malay,Indonesian": 0.5866666666666667, + "English,Malay,Spanish": 0.5266666666666666, + "English,Malay,Chinese": 0.3333333333333333, + "English,Malay,Filipino": 0.47333333333333333, + "English,Indonesian,Spanish": 0.5333333333333333, + "English,Indonesian,Chinese": 0.34, + "English,Indonesian,Filipino": 0.46, + "English,Spanish,Chinese": 0.34, + "English,Spanish,Filipino": 0.44, + "English,Chinese,Filipino": 0.31333333333333335, + "Vietnamese,Malay,Indonesian": 0.6333333333333333, + "Vietnamese,Malay,Spanish": 0.5666666666666667, + "Vietnamese,Malay,Chinese": 0.36666666666666664, + "Vietnamese,Malay,Filipino": 0.5466666666666666, + "Vietnamese,Indonesian,Spanish": 0.58, + "Vietnamese,Indonesian,Chinese": 0.37333333333333335, + "Vietnamese,Indonesian,Filipino": 0.52, + "Vietnamese,Spanish,Chinese": 0.38, + "Vietnamese,Spanish,Filipino": 0.4866666666666667, + "Vietnamese,Chinese,Filipino": 0.35333333333333333, + "Malay,Indonesian,Spanish": 0.74, + "Malay,Indonesian,Chinese": 0.44, + "Malay,Indonesian,Filipino": 0.68, + "Malay,Spanish,Chinese": 0.4066666666666667, + "Malay,Spanish,Filipino": 0.6, + "Malay,Chinese,Filipino": 0.38, + "Indonesian,Spanish,Chinese": 0.43333333333333335, + "Indonesian,Spanish,Filipino": 0.6066666666666667, + "Indonesian,Chinese,Filipino": 0.3933333333333333, + "Spanish,Chinese,Filipino": 0.37333333333333335 + }, + "4_combine": { + "English,Vietnamese,Malay,Indonesian": 0.41333333333333333, + "English,Vietnamese,Malay,Spanish": 0.4, + "English,Vietnamese,Malay,Chinese": 0.2733333333333333, + "English,Vietnamese,Malay,Filipino": 0.3466666666666667, + "English,Vietnamese,Indonesian,Spanish": 0.4, + "English,Vietnamese,Indonesian,Chinese": 0.26666666666666666, + "English,Vietnamese,Indonesian,Filipino": 0.3333333333333333, + "English,Vietnamese,Spanish,Chinese": 0.28, + "English,Vietnamese,Spanish,Filipino": 0.34, + "English,Vietnamese,Chinese,Filipino": 0.24666666666666667, + "English,Malay,Indonesian,Spanish": 0.5133333333333333, + "English,Malay,Indonesian,Chinese": 0.32, + "English,Malay,Indonesian,Filipino": 0.44666666666666666, + "English,Malay,Spanish,Chinese": 0.30666666666666664, + "English,Malay,Spanish,Filipino": 0.41333333333333333, + "English,Malay,Chinese,Filipino": 0.28, + "English,Indonesian,Spanish,Chinese": 0.31333333333333335, + "English,Indonesian,Spanish,Filipino": 0.41333333333333333, + "English,Indonesian,Chinese,Filipino": 0.28, + "English,Spanish,Chinese,Filipino": 0.28, + "Vietnamese,Malay,Indonesian,Spanish": 0.56, + "Vietnamese,Malay,Indonesian,Chinese": 0.35333333333333333, + "Vietnamese,Malay,Indonesian,Filipino": 0.5, + "Vietnamese,Malay,Spanish,Chinese": 0.34, + "Vietnamese,Malay,Spanish,Filipino": 0.4666666666666667, + "Vietnamese,Malay,Chinese,Filipino": 0.32, + "Vietnamese,Indonesian,Spanish,Chinese": 0.35333333333333333, + "Vietnamese,Indonesian,Spanish,Filipino": 0.48, + "Vietnamese,Indonesian,Chinese,Filipino": 0.32666666666666666, + "Vietnamese,Spanish,Chinese,Filipino": 0.31333333333333335, + "Malay,Indonesian,Spanish,Chinese": 0.4, + "Malay,Indonesian,Spanish,Filipino": 0.58, + "Malay,Indonesian,Chinese,Filipino": 0.36666666666666664, + "Malay,Spanish,Chinese,Filipino": 0.34, + "Indonesian,Spanish,Chinese,Filipino": 0.36 + }, + "5_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish": 0.3933333333333333, + "English,Vietnamese,Malay,Indonesian,Chinese": 0.26, + "English,Vietnamese,Malay,Indonesian,Filipino": 0.32666666666666666, + "English,Vietnamese,Malay,Spanish,Chinese": 0.26666666666666666, + "English,Vietnamese,Malay,Spanish,Filipino": 0.3333333333333333, + "English,Vietnamese,Malay,Chinese,Filipino": 0.24, + "English,Vietnamese,Indonesian,Spanish,Chinese": 0.26666666666666666, + "English,Vietnamese,Indonesian,Spanish,Filipino": 0.3333333333333333, + "English,Vietnamese,Indonesian,Chinese,Filipino": 0.23333333333333334, + "English,Vietnamese,Spanish,Chinese,Filipino": 0.24, + "English,Malay,Indonesian,Spanish,Chinese": 0.3, + "English,Malay,Indonesian,Spanish,Filipino": 0.4, + "English,Malay,Indonesian,Chinese,Filipino": 0.26666666666666666, + "English,Malay,Spanish,Chinese,Filipino": 0.26, + "English,Indonesian,Spanish,Chinese,Filipino": 0.26666666666666666, + "Vietnamese,Malay,Indonesian,Spanish,Chinese": 0.3333333333333333, + "Vietnamese,Malay,Indonesian,Spanish,Filipino": 0.46, + "Vietnamese,Malay,Indonesian,Chinese,Filipino": 0.30666666666666664, + "Vietnamese,Malay,Spanish,Chinese,Filipino": 0.29333333333333333, + "Vietnamese,Indonesian,Spanish,Chinese,Filipino": 0.30666666666666664, + "Malay,Indonesian,Spanish,Chinese,Filipino": 0.3333333333333333 + }, + "6_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish,Chinese": 0.26, + "English,Vietnamese,Malay,Indonesian,Spanish,Filipino": 0.32666666666666666, + "English,Vietnamese,Malay,Indonesian,Chinese,Filipino": 0.22666666666666666, + "English,Vietnamese,Malay,Spanish,Chinese,Filipino": 0.23333333333333334, + "English,Vietnamese,Indonesian,Spanish,Chinese,Filipino": 0.23333333333333334, + "English,Malay,Indonesian,Spanish,Chinese,Filipino": 0.25333333333333335, + "Vietnamese,Malay,Indonesian,Spanish,Chinese,Filipino": 0.2866666666666667 + }, + "7_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish,Chinese,Filipino": 0.22666666666666666 + } + }, + "AC3_2": 0.40185500319022294, + "AC3_3": 0.3627545052631654, + "AC3_4": 0.33049491778089396, + "AC3_5": 0.3023442069241343, + "AC3_6": 0.278160136236445, + "AC3_7": 0.25787439608621354 + }, + "prompt_4": { + "overall_acc": 0.3152380952380952, + "language_acc": { + "English": 0.44, + "Vietnamese": 0.30666666666666664, + "Malay": 0.26, + "Indonesian": 0.26666666666666666, + "Spanish": 0.30666666666666664, + "Chinese": 0.3466666666666667, + "Filipino": 0.28 + }, + "consistency_score_2": 0.5669841269841269, + "consistency_score_3": 0.3958095238095237, + "consistency_score_4": 0.29561904761904767, + "consistency_score_5": 0.22920634920634922, + "consistency_score_6": 0.18380952380952378, + "consistency_score_7": 0.15333333333333332, + "detailed_consistency_score": { + "2_combine": { + "English,Vietnamese": 0.4866666666666667, + "English,Malay": 0.6266666666666667, + "English,Indonesian": 0.58, + "English,Spanish": 0.58, + "English,Chinese": 0.4066666666666667, + "English,Filipino": 0.5666666666666667, + "Vietnamese,Malay": 0.5933333333333334, + "Vietnamese,Indonesian": 0.54, + "Vietnamese,Spanish": 0.5466666666666666, + "Vietnamese,Chinese": 0.4266666666666667, + "Vietnamese,Filipino": 0.58, + "Malay,Indonesian": 0.8666666666666667, + "Malay,Spanish": 0.7333333333333333, + "Malay,Chinese": 0.36, + "Malay,Filipino": 0.7466666666666667, + "Indonesian,Spanish": 0.74, + "Indonesian,Chinese": 0.4, + "Indonesian,Filipino": 0.7, + "Spanish,Chinese": 0.42, + "Spanish,Filipino": 0.6, + "Chinese,Filipino": 0.4066666666666667 + }, + "3_combine": { + "English,Vietnamese,Malay": 0.38666666666666666, + "English,Vietnamese,Indonesian": 0.35333333333333333, + "English,Vietnamese,Spanish": 0.36, + "English,Vietnamese,Chinese": 0.24666666666666667, + "English,Vietnamese,Filipino": 0.36, + "English,Malay,Indonesian": 0.56, + "English,Malay,Spanish": 0.5066666666666667, + "English,Malay,Chinese": 0.26, + "English,Malay,Filipino": 0.5, + "English,Indonesian,Spanish": 0.49333333333333335, + "English,Indonesian,Chinese": 0.26666666666666666, + "English,Indonesian,Filipino": 0.4533333333333333, + "English,Spanish,Chinese": 0.28, + "English,Spanish,Filipino": 0.42, + "English,Chinese,Filipino": 0.25333333333333335, + "Vietnamese,Malay,Indonesian": 0.5066666666666667, + "Vietnamese,Malay,Spanish": 0.4666666666666667, + "Vietnamese,Malay,Chinese": 0.26, + "Vietnamese,Malay,Filipino": 0.48, + "Vietnamese,Indonesian,Spanish": 0.46, + "Vietnamese,Indonesian,Chinese": 0.26666666666666666, + "Vietnamese,Indonesian,Filipino": 0.4266666666666667, + "Vietnamese,Spanish,Chinese": 0.26666666666666666, + "Vietnamese,Spanish,Filipino": 0.3933333333333333, + "Vietnamese,Chinese,Filipino": 0.2733333333333333, + "Malay,Indonesian,Spanish": 0.6933333333333334, + "Malay,Indonesian,Chinese": 0.34, + "Malay,Indonesian,Filipino": 0.66, + "Malay,Spanish,Chinese": 0.31333333333333335, + "Malay,Spanish,Filipino": 0.5666666666666667, + "Malay,Chinese,Filipino": 0.3, + "Indonesian,Spanish,Chinese": 0.3333333333333333, + "Indonesian,Spanish,Filipino": 0.5533333333333333, + "Indonesian,Chinese,Filipino": 0.31333333333333335, + "Spanish,Chinese,Filipino": 0.28 + }, + "4_combine": { + "English,Vietnamese,Malay,Indonesian": 0.34, + "English,Vietnamese,Malay,Spanish": 0.3333333333333333, + "English,Vietnamese,Malay,Chinese": 0.19333333333333333, + "English,Vietnamese,Malay,Filipino": 0.32666666666666666, + "English,Vietnamese,Indonesian,Spanish": 0.32666666666666666, + "English,Vietnamese,Indonesian,Chinese": 0.19333333333333333, + "English,Vietnamese,Indonesian,Filipino": 0.29333333333333333, + "English,Vietnamese,Spanish,Chinese": 0.2, + "English,Vietnamese,Spanish,Filipino": 0.29333333333333333, + "English,Vietnamese,Chinese,Filipino": 0.19333333333333333, + "English,Malay,Indonesian,Spanish": 0.48, + "English,Malay,Indonesian,Chinese": 0.24666666666666667, + "English,Malay,Indonesian,Filipino": 0.44, + "English,Malay,Spanish,Chinese": 0.23333333333333334, + "English,Malay,Spanish,Filipino": 0.4, + "English,Malay,Chinese,Filipino": 0.22, + "English,Indonesian,Spanish,Chinese": 0.24666666666666667, + "English,Indonesian,Spanish,Filipino": 0.38666666666666666, + "English,Indonesian,Chinese,Filipino": 0.22, + "English,Spanish,Chinese,Filipino": 0.20666666666666667, + "Vietnamese,Malay,Indonesian,Spanish": 0.43333333333333335, + "Vietnamese,Malay,Indonesian,Chinese": 0.24, + "Vietnamese,Malay,Indonesian,Filipino": 0.4066666666666667, + "Vietnamese,Malay,Spanish,Chinese": 0.23333333333333334, + "Vietnamese,Malay,Spanish,Filipino": 0.38, + "Vietnamese,Malay,Chinese,Filipino": 0.21333333333333335, + "Vietnamese,Indonesian,Spanish,Chinese": 0.24666666666666667, + "Vietnamese,Indonesian,Spanish,Filipino": 0.36666666666666664, + "Vietnamese,Indonesian,Chinese,Filipino": 0.21333333333333335, + "Vietnamese,Spanish,Chinese,Filipino": 0.2, + "Malay,Indonesian,Spanish,Chinese": 0.30666666666666664, + "Malay,Indonesian,Spanish,Filipino": 0.5333333333333333, + "Malay,Indonesian,Chinese,Filipino": 0.28, + "Malay,Spanish,Chinese,Filipino": 0.25333333333333335, + "Indonesian,Spanish,Chinese,Filipino": 0.26666666666666666 + }, + "5_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish": 0.31333333333333335, + "English,Vietnamese,Malay,Indonesian,Chinese": 0.18, + "English,Vietnamese,Malay,Indonesian,Filipino": 0.28, + "English,Vietnamese,Malay,Spanish,Chinese": 0.18, + "English,Vietnamese,Malay,Spanish,Filipino": 0.28, + "English,Vietnamese,Malay,Chinese,Filipino": 0.16666666666666666, + "English,Vietnamese,Indonesian,Spanish,Chinese": 0.19333333333333333, + "English,Vietnamese,Indonesian,Spanish,Filipino": 0.2733333333333333, + "English,Vietnamese,Indonesian,Chinese,Filipino": 0.16666666666666666, + "English,Vietnamese,Spanish,Chinese,Filipino": 0.16666666666666666, + "English,Malay,Indonesian,Spanish,Chinese": 0.23333333333333334, + "English,Malay,Indonesian,Spanish,Filipino": 0.37333333333333335, + "English,Malay,Indonesian,Chinese,Filipino": 0.20666666666666667, + "English,Malay,Spanish,Chinese,Filipino": 0.19333333333333333, + "English,Indonesian,Spanish,Chinese,Filipino": 0.20666666666666667, + "Vietnamese,Malay,Indonesian,Spanish,Chinese": 0.22666666666666666, + "Vietnamese,Malay,Indonesian,Spanish,Filipino": 0.35333333333333333, + "Vietnamese,Malay,Indonesian,Chinese,Filipino": 0.19333333333333333, + "Vietnamese,Malay,Spanish,Chinese,Filipino": 0.18666666666666668, + "Vietnamese,Indonesian,Spanish,Chinese,Filipino": 0.19333333333333333, + "Malay,Indonesian,Spanish,Chinese,Filipino": 0.24666666666666667 + }, + "6_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish,Chinese": 0.18, + "English,Vietnamese,Malay,Indonesian,Spanish,Filipino": 0.26, + "English,Vietnamese,Malay,Indonesian,Chinese,Filipino": 0.15333333333333332, + "English,Vietnamese,Malay,Spanish,Chinese,Filipino": 0.15333333333333332, + "English,Vietnamese,Indonesian,Spanish,Chinese,Filipino": 0.16666666666666666, + "English,Malay,Indonesian,Spanish,Chinese,Filipino": 0.19333333333333333, + "Vietnamese,Malay,Indonesian,Spanish,Chinese,Filipino": 0.18 + }, + "7_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish,Chinese,Filipino": 0.15333333333333332 + } + }, + "AC3_2": 0.40519268660051827, + "AC3_3": 0.3509588865859282, + "AC3_4": 0.30511351651352137, + "AC3_5": 0.2654249617727118, + "AC3_6": 0.2322173754532861, + "AC3_7": 0.2063143630996009 + }, + "prompt_5": { + "overall_acc": 0.3180952380952381, + "language_acc": { + "English": 0.44666666666666666, + "Vietnamese": 0.3, + "Malay": 0.26666666666666666, + "Indonesian": 0.2733333333333333, + "Spanish": 0.3, + "Chinese": 0.3333333333333333, + "Filipino": 0.30666666666666664 + }, + "consistency_score_2": 0.5619047619047619, + "consistency_score_3": 0.3887619047619048, + "consistency_score_4": 0.2864761904761905, + "consistency_score_5": 0.2174603174603175, + "consistency_score_6": 0.16857142857142857, + "consistency_score_7": 0.13333333333333333, + "detailed_consistency_score": { + "2_combine": { + "English,Vietnamese": 0.4533333333333333, + "English,Malay": 0.6, + "English,Indonesian": 0.5733333333333334, + "English,Spanish": 0.5533333333333333, + "English,Chinese": 0.42, + "English,Filipino": 0.5933333333333334, + "Vietnamese,Malay": 0.5266666666666666, + "Vietnamese,Indonesian": 0.5, + "Vietnamese,Spanish": 0.49333333333333335, + "Vietnamese,Chinese": 0.4533333333333333, + "Vietnamese,Filipino": 0.5066666666666667, + "Malay,Indonesian": 0.8933333333333333, + "Malay,Spanish": 0.7266666666666667, + "Malay,Chinese": 0.4, + "Malay,Filipino": 0.74, + "Indonesian,Spanish": 0.7266666666666667, + "Indonesian,Chinese": 0.4266666666666667, + "Indonesian,Filipino": 0.7266666666666667, + "Spanish,Chinese": 0.46, + "Spanish,Filipino": 0.6066666666666667, + "Chinese,Filipino": 0.42 + }, + "3_combine": { + "English,Vietnamese,Malay": 0.34, + "English,Vietnamese,Indonesian": 0.32666666666666666, + "English,Vietnamese,Spanish": 0.3, + "English,Vietnamese,Chinese": 0.24, + "English,Vietnamese,Filipino": 0.3333333333333333, + "English,Malay,Indonesian": 0.5466666666666666, + "English,Malay,Spanish": 0.47333333333333333, + "English,Malay,Chinese": 0.26666666666666666, + "English,Malay,Filipino": 0.49333333333333335, + "English,Indonesian,Spanish": 0.4666666666666667, + "English,Indonesian,Chinese": 0.28, + "English,Indonesian,Filipino": 0.47333333333333333, + "English,Spanish,Chinese": 0.28, + "English,Spanish,Filipino": 0.4066666666666667, + "English,Chinese,Filipino": 0.2733333333333333, + "Vietnamese,Malay,Indonesian": 0.47333333333333333, + "Vietnamese,Malay,Spanish": 0.42, + "Vietnamese,Malay,Chinese": 0.2733333333333333, + "Vietnamese,Malay,Filipino": 0.4066666666666667, + "Vietnamese,Indonesian,Spanish": 0.41333333333333333, + "Vietnamese,Indonesian,Chinese": 0.2733333333333333, + "Vietnamese,Indonesian,Filipino": 0.3933333333333333, + "Vietnamese,Spanish,Chinese": 0.2866666666666667, + "Vietnamese,Spanish,Filipino": 0.3466666666666667, + "Vietnamese,Chinese,Filipino": 0.2733333333333333, + "Malay,Indonesian,Spanish": 0.6866666666666666, + "Malay,Indonesian,Chinese": 0.38, + "Malay,Indonesian,Filipino": 0.68, + "Malay,Spanish,Chinese": 0.35333333333333333, + "Malay,Spanish,Filipino": 0.56, + "Malay,Chinese,Filipino": 0.32, + "Indonesian,Spanish,Chinese": 0.36666666666666664, + "Indonesian,Spanish,Filipino": 0.56, + "Indonesian,Chinese,Filipino": 0.3333333333333333, + "Spanish,Chinese,Filipino": 0.30666666666666664 + }, + "4_combine": { + "English,Vietnamese,Malay,Indonesian": 0.31333333333333335, + "English,Vietnamese,Malay,Spanish": 0.28, + "English,Vietnamese,Malay,Chinese": 0.18, + "English,Vietnamese,Malay,Filipino": 0.29333333333333333, + "English,Vietnamese,Indonesian,Spanish": 0.28, + "English,Vietnamese,Indonesian,Chinese": 0.18, + "English,Vietnamese,Indonesian,Filipino": 0.28, + "English,Vietnamese,Spanish,Chinese": 0.18, + "English,Vietnamese,Spanish,Filipino": 0.24666666666666667, + "English,Vietnamese,Chinese,Filipino": 0.18, + "English,Malay,Indonesian,Spanish": 0.44666666666666666, + "English,Malay,Indonesian,Chinese": 0.26, + "English,Malay,Indonesian,Filipino": 0.4533333333333333, + "English,Malay,Spanish,Chinese": 0.23333333333333334, + "English,Malay,Spanish,Filipino": 0.38, + "English,Malay,Chinese,Filipino": 0.22666666666666666, + "English,Indonesian,Spanish,Chinese": 0.24, + "English,Indonesian,Spanish,Filipino": 0.38, + "English,Indonesian,Chinese,Filipino": 0.23333333333333334, + "English,Spanish,Chinese,Filipino": 0.20666666666666667, + "Vietnamese,Malay,Indonesian,Spanish": 0.3933333333333333, + "Vietnamese,Malay,Indonesian,Chinese": 0.25333333333333335, + "Vietnamese,Malay,Indonesian,Filipino": 0.36666666666666664, + "Vietnamese,Malay,Spanish,Chinese": 0.25333333333333335, + "Vietnamese,Malay,Spanish,Filipino": 0.32666666666666666, + "Vietnamese,Malay,Chinese,Filipino": 0.21333333333333335, + "Vietnamese,Indonesian,Spanish,Chinese": 0.25333333333333335, + "Vietnamese,Indonesian,Spanish,Filipino": 0.32666666666666666, + "Vietnamese,Indonesian,Chinese,Filipino": 0.22, + "Vietnamese,Spanish,Chinese,Filipino": 0.20666666666666667, + "Malay,Indonesian,Spanish,Chinese": 0.34, + "Malay,Indonesian,Spanish,Filipino": 0.5333333333333333, + "Malay,Indonesian,Chinese,Filipino": 0.30666666666666664, + "Malay,Spanish,Chinese,Filipino": 0.2733333333333333, + "Indonesian,Spanish,Chinese,Filipino": 0.2866666666666667 + }, + "5_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish": 0.26666666666666666, + "English,Vietnamese,Malay,Indonesian,Chinese": 0.17333333333333334, + "English,Vietnamese,Malay,Indonesian,Filipino": 0.26666666666666666, + "English,Vietnamese,Malay,Spanish,Chinese": 0.16666666666666666, + "English,Vietnamese,Malay,Spanish,Filipino": 0.23333333333333334, + "English,Vietnamese,Malay,Chinese,Filipino": 0.15333333333333332, + "English,Vietnamese,Indonesian,Spanish,Chinese": 0.16666666666666666, + "English,Vietnamese,Indonesian,Spanish,Filipino": 0.23333333333333334, + "English,Vietnamese,Indonesian,Chinese,Filipino": 0.15333333333333332, + "English,Vietnamese,Spanish,Chinese,Filipino": 0.14666666666666667, + "English,Malay,Indonesian,Spanish,Chinese": 0.22666666666666666, + "English,Malay,Indonesian,Spanish,Filipino": 0.36, + "English,Malay,Indonesian,Chinese,Filipino": 0.22, + "English,Malay,Spanish,Chinese,Filipino": 0.19333333333333333, + "English,Indonesian,Spanish,Chinese,Filipino": 0.2, + "Vietnamese,Malay,Indonesian,Spanish,Chinese": 0.24, + "Vietnamese,Malay,Indonesian,Spanish,Filipino": 0.30666666666666664, + "Vietnamese,Malay,Indonesian,Chinese,Filipino": 0.2, + "Vietnamese,Malay,Spanish,Chinese,Filipino": 0.19333333333333333, + "Vietnamese,Indonesian,Spanish,Chinese,Filipino": 0.2, + "Malay,Indonesian,Spanish,Chinese,Filipino": 0.26666666666666666 + }, + "6_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish,Chinese": 0.16, + "English,Vietnamese,Malay,Indonesian,Spanish,Filipino": 0.22, + "English,Vietnamese,Malay,Indonesian,Chinese,Filipino": 0.14666666666666667, + "English,Vietnamese,Malay,Spanish,Chinese,Filipino": 0.14, + "English,Vietnamese,Indonesian,Spanish,Chinese,Filipino": 0.14, + "English,Malay,Indonesian,Spanish,Chinese,Filipino": 0.18666666666666668, + "Vietnamese,Malay,Indonesian,Spanish,Chinese,Filipino": 0.18666666666666668 + }, + "7_combine": { + "English,Vietnamese,Malay,Indonesian,Spanish,Chinese,Filipino": 0.13333333333333333 + } + }, + "AC3_2": 0.4062255204650728, + "AC3_3": 0.34989619017005297, + "AC3_4": 0.30145887713680225, + "AC3_5": 0.25832274813817696, + "AC3_6": 0.22036343299917047, + "AC3_7": 0.18790436001463443 + } }, - "mrpc": { - "prompt_1": -1, - "prompt_2": -1, + "cross_logiqa": { + "prompt_1": { + "overall_acc": 0.27435064935064934, + "language_acc": { + "Vietnamese": 0.3125, + "Indonesian": 0.2556818181818182, + "Malay": 0.26704545454545453, + "English": 0.26704545454545453, + "Spanish": 0.2556818181818182, + "Filipino": 0.29545454545454547, + "Chinese": 0.26704545454545453 + }, + "consistency_score_2": 0.5367965367965367, + "consistency_score_3": 0.3603896103896104, + "consistency_score_4": 0.2576298701298702, + "consistency_score_5": 0.1899350649350649, + "consistency_score_6": 0.14285714285714285, + "consistency_score_7": 0.10795454545454546, + "detailed_consistency_score": { + "2_combine": { + "Vietnamese,Indonesian": 0.3806818181818182, + "Vietnamese,Malay": 0.5, + "Vietnamese,English": 0.5, + "Vietnamese,Spanish": 0.5340909090909091, + "Vietnamese,Filipino": 0.4772727272727273, + "Vietnamese,Chinese": 0.4147727272727273, + "Indonesian,Malay": 0.5454545454545454, + "Indonesian,English": 0.5795454545454546, + "Indonesian,Spanish": 0.5454545454545454, + "Indonesian,Filipino": 0.5, + "Indonesian,Chinese": 0.48295454545454547, + "Malay,English": 0.6306818181818182, + "Malay,Spanish": 0.5738636363636364, + "Malay,Filipino": 0.5170454545454546, + "Malay,Chinese": 0.5170454545454546, + "English,Spanish": 0.6590909090909091, + "English,Filipino": 0.625, + "English,Chinese": 0.625, + "Spanish,Filipino": 0.6306818181818182, + "Spanish,Chinese": 0.4943181818181818, + "Filipino,Chinese": 0.5397727272727273 + }, + "3_combine": { + "Vietnamese,Indonesian,Malay": 0.29545454545454547, + "Vietnamese,Indonesian,English": 0.2840909090909091, + "Vietnamese,Indonesian,Spanish": 0.2897727272727273, + "Vietnamese,Indonesian,Filipino": 0.2556818181818182, + "Vietnamese,Indonesian,Chinese": 0.2215909090909091, + "Vietnamese,Malay,English": 0.3806818181818182, + "Vietnamese,Malay,Spanish": 0.36363636363636365, + "Vietnamese,Malay,Filipino": 0.30113636363636365, + "Vietnamese,Malay,Chinese": 0.2727272727272727, + "Vietnamese,English,Spanish": 0.3977272727272727, + "Vietnamese,English,Filipino": 0.3522727272727273, + "Vietnamese,English,Chinese": 0.32954545454545453, + "Vietnamese,Spanish,Filipino": 0.3693181818181818, + "Vietnamese,Spanish,Chinese": 0.29545454545454547, + "Vietnamese,Filipino,Chinese": 0.2840909090909091, + "Indonesian,Malay,English": 0.4318181818181818, + "Indonesian,Malay,Spanish": 0.3977272727272727, + "Indonesian,Malay,Filipino": 0.3522727272727273, + "Indonesian,Malay,Chinese": 0.3352272727272727, + "Indonesian,English,Spanish": 0.4318181818181818, + "Indonesian,English,Filipino": 0.39204545454545453, + "Indonesian,English,Chinese": 0.3977272727272727, + "Indonesian,Spanish,Filipino": 0.3806818181818182, + "Indonesian,Spanish,Chinese": 0.32386363636363635, + "Indonesian,Filipino,Chinese": 0.3181818181818182, + "Malay,English,Spanish": 0.4602272727272727, + "Malay,English,Filipino": 0.42613636363636365, + "Malay,English,Chinese": 0.42613636363636365, + "Malay,Spanish,Filipino": 0.4147727272727273, + "Malay,Spanish,Chinese": 0.3409090909090909, + "Malay,Filipino,Chinese": 0.3409090909090909, + "English,Spanish,Filipino": 0.4943181818181818, + "English,Spanish,Chinese": 0.4375, + "English,Filipino,Chinese": 0.4375, + "Spanish,Filipino,Chinese": 0.3806818181818182 + }, + "4_combine": { + "Vietnamese,Indonesian,Malay,English": 0.25, + "Vietnamese,Indonesian,Malay,Spanish": 0.25, + "Vietnamese,Indonesian,Malay,Filipino": 0.20454545454545456, + "Vietnamese,Indonesian,Malay,Chinese": 0.18181818181818182, + "Vietnamese,Indonesian,English,Spanish": 0.25, + "Vietnamese,Indonesian,English,Filipino": 0.21022727272727273, + "Vietnamese,Indonesian,English,Chinese": 0.19886363636363635, + "Vietnamese,Indonesian,Spanish,Filipino": 0.2215909090909091, + "Vietnamese,Indonesian,Spanish,Chinese": 0.19318181818181818, + "Vietnamese,Indonesian,Filipino,Chinese": 0.16477272727272727, + "Vietnamese,Malay,English,Spanish": 0.3068181818181818, + "Vietnamese,Malay,English,Filipino": 0.2556818181818182, + "Vietnamese,Malay,English,Chinese": 0.2556818181818182, + "Vietnamese,Malay,Spanish,Filipino": 0.26136363636363635, + "Vietnamese,Malay,Spanish,Chinese": 0.2159090909090909, + "Vietnamese,Malay,Filipino,Chinese": 0.1875, + "Vietnamese,English,Spanish,Filipino": 0.30113636363636365, + "Vietnamese,English,Spanish,Chinese": 0.26136363636363635, + "Vietnamese,English,Filipino,Chinese": 0.23863636363636365, + "Vietnamese,Spanish,Filipino,Chinese": 0.22727272727272727, + "Indonesian,Malay,English,Spanish": 0.3352272727272727, + "Indonesian,Malay,English,Filipino": 0.29545454545454547, + "Indonesian,Malay,English,Chinese": 0.2897727272727273, + "Indonesian,Malay,Spanish,Filipino": 0.2840909090909091, + "Indonesian,Malay,Spanish,Chinese": 0.25, + "Indonesian,Malay,Filipino,Chinese": 0.23295454545454544, + "Indonesian,English,Spanish,Filipino": 0.32386363636363635, + "Indonesian,English,Spanish,Chinese": 0.29545454545454547, + "Indonesian,English,Filipino,Chinese": 0.26704545454545453, + "Indonesian,Spanish,Filipino,Chinese": 0.24431818181818182, + "Malay,English,Spanish,Filipino": 0.3465909090909091, + "Malay,English,Spanish,Chinese": 0.3125, + "Malay,English,Filipino,Chinese": 0.30113636363636365, + "Malay,Spanish,Filipino,Chinese": 0.26704545454545453, + "English,Spanish,Filipino,Chinese": 0.3352272727272727 + }, + "5_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish": 0.2215909090909091, + "Vietnamese,Indonesian,Malay,English,Filipino": 0.17613636363636365, + "Vietnamese,Indonesian,Malay,English,Chinese": 0.17045454545454544, + "Vietnamese,Indonesian,Malay,Spanish,Filipino": 0.18181818181818182, + "Vietnamese,Indonesian,Malay,Spanish,Chinese": 0.16477272727272727, + "Vietnamese,Indonesian,Malay,Filipino,Chinese": 0.13068181818181818, + "Vietnamese,Indonesian,English,Spanish,Filipino": 0.1875, + "Vietnamese,Indonesian,English,Spanish,Chinese": 0.17613636363636365, + "Vietnamese,Indonesian,English,Filipino,Chinese": 0.14772727272727273, + "Vietnamese,Indonesian,Spanish,Filipino,Chinese": 0.14772727272727273, + "Vietnamese,Malay,English,Spanish,Filipino": 0.22727272727272727, + "Vietnamese,Malay,English,Spanish,Chinese": 0.20454545454545456, + "Vietnamese,Malay,English,Filipino,Chinese": 0.17613636363636365, + "Vietnamese,Malay,Spanish,Filipino,Chinese": 0.16477272727272727, + "Vietnamese,English,Spanish,Filipino,Chinese": 0.19886363636363635, + "Indonesian,Malay,English,Spanish,Filipino": 0.24431818181818182, + "Indonesian,Malay,English,Spanish,Chinese": 0.22727272727272727, + "Indonesian,Malay,English,Filipino,Chinese": 0.19886363636363635, + "Indonesian,Malay,Spanish,Filipino,Chinese": 0.1875, + "Indonesian,English,Spanish,Filipino,Chinese": 0.2159090909090909, + "Malay,English,Spanish,Filipino,Chinese": 0.23863636363636365 + }, + "6_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish,Filipino": 0.1590909090909091, + "Vietnamese,Indonesian,Malay,English,Spanish,Chinese": 0.1534090909090909, + "Vietnamese,Indonesian,Malay,English,Filipino,Chinese": 0.11931818181818182, + "Vietnamese,Indonesian,Malay,Spanish,Filipino,Chinese": 0.11931818181818182, + "Vietnamese,Indonesian,English,Spanish,Filipino,Chinese": 0.13068181818181818, + "Vietnamese,Malay,English,Spanish,Filipino,Chinese": 0.1534090909090909, + "Indonesian,Malay,English,Spanish,Filipino,Chinese": 0.16477272727272727 + }, + "7_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish,Filipino,Chinese": 0.10795454545454546 + } + }, + "AC3_2": 0.36311653652073667, + "AC3_3": 0.31153884473784527, + "AC3_4": 0.26572748272055935, + "AC3_5": 0.2244687130567296, + "AC3_6": 0.18788215670871883, + "AC3_7": 0.1549411310152526 + }, + "prompt_2": { + "overall_acc": 0.26866883116883117, + "language_acc": { + "Vietnamese": 0.30113636363636365, + "Indonesian": 0.2727272727272727, + "Malay": 0.23863636363636365, + "English": 0.26704545454545453, + "Spanish": 0.26704545454545453, + "Filipino": 0.2727272727272727, + "Chinese": 0.26136363636363635 + }, + "consistency_score_2": 0.5411255411255411, + "consistency_score_3": 0.3728896103896104, + "consistency_score_4": 0.2717532467532468, + "consistency_score_5": 0.2021103896103896, + "consistency_score_6": 0.15178571428571427, + "consistency_score_7": 0.11363636363636363, + "detailed_consistency_score": { + "2_combine": { + "Vietnamese,Indonesian": 0.4602272727272727, + "Vietnamese,Malay": 0.5397727272727273, + "Vietnamese,English": 0.5795454545454546, + "Vietnamese,Spanish": 0.5738636363636364, + "Vietnamese,Filipino": 0.5284090909090909, + "Vietnamese,Chinese": 0.5170454545454546, + "Indonesian,Malay": 0.5340909090909091, + "Indonesian,English": 0.4659090909090909, + "Indonesian,Spanish": 0.4659090909090909, + "Indonesian,Filipino": 0.42045454545454547, + "Indonesian,Chinese": 0.4431818181818182, + "Malay,English": 0.5795454545454546, + "Malay,Spanish": 0.5568181818181818, + "Malay,Filipino": 0.5625, + "Malay,Chinese": 0.5397727272727273, + "English,Spanish": 0.6647727272727273, + "English,Filipino": 0.6079545454545454, + "English,Chinese": 0.5965909090909091, + "Spanish,Filipino": 0.6193181818181818, + "Spanish,Chinese": 0.5340909090909091, + "Filipino,Chinese": 0.5738636363636364 + }, + "3_combine": { + "Vietnamese,Indonesian,Malay": 0.3522727272727273, + "Vietnamese,Indonesian,English": 0.3181818181818182, + "Vietnamese,Indonesian,Spanish": 0.32954545454545453, + "Vietnamese,Indonesian,Filipino": 0.2784090909090909, + "Vietnamese,Indonesian,Chinese": 0.29545454545454547, + "Vietnamese,Malay,English": 0.42045454545454547, + "Vietnamese,Malay,Spanish": 0.38636363636363635, + "Vietnamese,Malay,Filipino": 0.35795454545454547, + "Vietnamese,Malay,Chinese": 0.36363636363636365, + "Vietnamese,English,Spanish": 0.4659090909090909, + "Vietnamese,English,Filipino": 0.4147727272727273, + "Vietnamese,English,Chinese": 0.4034090909090909, + "Vietnamese,Spanish,Filipino": 0.4090909090909091, + "Vietnamese,Spanish,Chinese": 0.375, + "Vietnamese,Filipino,Chinese": 0.3522727272727273, + "Indonesian,Malay,English": 0.35795454545454547, + "Indonesian,Malay,Spanish": 0.3522727272727273, + "Indonesian,Malay,Filipino": 0.3352272727272727, + "Indonesian,Malay,Chinese": 0.3465909090909091, + "Indonesian,English,Spanish": 0.35795454545454547, + "Indonesian,English,Filipino": 0.3125, + "Indonesian,English,Chinese": 0.32954545454545453, + "Indonesian,Spanish,Filipino": 0.3068181818181818, + "Indonesian,Spanish,Chinese": 0.3068181818181818, + "Indonesian,Filipino,Chinese": 0.30113636363636365, + "Malay,English,Spanish": 0.45454545454545453, + "Malay,English,Filipino": 0.4147727272727273, + "Malay,English,Chinese": 0.3977272727272727, + "Malay,Spanish,Filipino": 0.4090909090909091, + "Malay,Spanish,Chinese": 0.375, + "Malay,Filipino,Chinese": 0.3806818181818182, + "English,Spanish,Filipino": 0.4943181818181818, + "English,Spanish,Chinese": 0.45454545454545453, + "English,Filipino,Chinese": 0.42613636363636365, + "Spanish,Filipino,Chinese": 0.4147727272727273 + }, + "4_combine": { + "Vietnamese,Indonesian,Malay,English": 0.26704545454545453, + "Vietnamese,Indonesian,Malay,Spanish": 0.2727272727272727, + "Vietnamese,Indonesian,Malay,Filipino": 0.22727272727272727, + "Vietnamese,Indonesian,Malay,Chinese": 0.23863636363636365, + "Vietnamese,Indonesian,English,Spanish": 0.2727272727272727, + "Vietnamese,Indonesian,English,Filipino": 0.2215909090909091, + "Vietnamese,Indonesian,English,Chinese": 0.23863636363636365, + "Vietnamese,Indonesian,Spanish,Filipino": 0.23863636363636365, + "Vietnamese,Indonesian,Spanish,Chinese": 0.24431818181818182, + "Vietnamese,Indonesian,Filipino,Chinese": 0.20454545454545456, + "Vietnamese,Malay,English,Spanish": 0.3352272727272727, + "Vietnamese,Malay,English,Filipino": 0.29545454545454547, + "Vietnamese,Malay,English,Chinese": 0.2897727272727273, + "Vietnamese,Malay,Spanish,Filipino": 0.2840909090909091, + "Vietnamese,Malay,Spanish,Chinese": 0.26704545454545453, + "Vietnamese,Malay,Filipino,Chinese": 0.24431818181818182, + "Vietnamese,English,Spanish,Filipino": 0.3465909090909091, + "Vietnamese,English,Spanish,Chinese": 0.32386363636363635, + "Vietnamese,English,Filipino,Chinese": 0.29545454545454547, + "Vietnamese,Spanish,Filipino,Chinese": 0.2840909090909091, + "Indonesian,Malay,English,Spanish": 0.29545454545454547, + "Indonesian,Malay,English,Filipino": 0.25, + "Indonesian,Malay,English,Chinese": 0.26136363636363635, + "Indonesian,Malay,Spanish,Filipino": 0.24431818181818182, + "Indonesian,Malay,Spanish,Chinese": 0.2556818181818182, + "Indonesian,Malay,Filipino,Chinese": 0.25, + "Indonesian,English,Spanish,Filipino": 0.26136363636363635, + "Indonesian,English,Spanish,Chinese": 0.26704545454545453, + "Indonesian,English,Filipino,Chinese": 0.23863636363636365, + "Indonesian,Spanish,Filipino,Chinese": 0.22727272727272727, + "Malay,English,Spanish,Filipino": 0.3352272727272727, + "Malay,English,Spanish,Chinese": 0.3181818181818182, + "Malay,English,Filipino,Chinese": 0.2840909090909091, + "Malay,Spanish,Filipino,Chinese": 0.2784090909090909, + "English,Spanish,Filipino,Chinese": 0.3522727272727273 + }, + "5_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish": 0.22727272727272727, + "Vietnamese,Indonesian,Malay,English,Filipino": 0.17613636363636365, + "Vietnamese,Indonesian,Malay,English,Chinese": 0.19318181818181818, + "Vietnamese,Indonesian,Malay,Spanish,Filipino": 0.1875, + "Vietnamese,Indonesian,Malay,Spanish,Chinese": 0.19886363636363635, + "Vietnamese,Indonesian,Malay,Filipino,Chinese": 0.16477272727272727, + "Vietnamese,Indonesian,English,Spanish,Filipino": 0.19886363636363635, + "Vietnamese,Indonesian,English,Spanish,Chinese": 0.21022727272727273, + "Vietnamese,Indonesian,English,Filipino,Chinese": 0.17045454545454544, + "Vietnamese,Indonesian,Spanish,Filipino,Chinese": 0.17613636363636365, + "Vietnamese,Malay,English,Spanish,Filipino": 0.24431818181818182, + "Vietnamese,Malay,English,Spanish,Chinese": 0.23295454545454544, + "Vietnamese,Malay,English,Filipino,Chinese": 0.19886363636363635, + "Vietnamese,Malay,Spanish,Filipino,Chinese": 0.19318181818181818, + "Vietnamese,English,Spanish,Filipino,Chinese": 0.25, + "Indonesian,Malay,English,Spanish,Filipino": 0.20454545454545456, + "Indonesian,Malay,English,Spanish,Chinese": 0.2215909090909091, + "Indonesian,Malay,English,Filipino,Chinese": 0.1875, + "Indonesian,Malay,Spanish,Filipino,Chinese": 0.18181818181818182, + "Indonesian,English,Spanish,Filipino,Chinese": 0.19886363636363635, + "Malay,English,Spanish,Filipino,Chinese": 0.22727272727272727 + }, + "6_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish,Filipino": 0.1534090909090909, + "Vietnamese,Indonesian,Malay,English,Spanish,Chinese": 0.17045454545454544, + "Vietnamese,Indonesian,Malay,English,Filipino,Chinese": 0.13068181818181818, + "Vietnamese,Indonesian,Malay,Spanish,Filipino,Chinese": 0.13636363636363635, + "Vietnamese,Indonesian,English,Spanish,Filipino,Chinese": 0.1534090909090909, + "Vietnamese,Malay,English,Spanish,Filipino,Chinese": 0.16477272727272727, + "Indonesian,Malay,English,Spanish,Filipino,Chinese": 0.1534090909090909 + }, + "7_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish,Filipino,Chinese": 0.11363636363636363 + } + }, + "AC3_2": 0.3590629216647561, + "AC3_3": 0.3123139195843181, + "AC3_4": 0.27020223688398987, + "AC3_5": 0.23068461705803028, + "AC3_6": 0.19398097072054904, + "AC3_7": 0.15971820107769732 + }, "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 - } - }, - "five_shot": { - "cross_mmlu": { - "prompt_1": -1 - }, - "cross_logiqa": { - "prompt_1": -1 + "prompt_4": { + "overall_acc": 0.2784090909090909, + "language_acc": { + "Vietnamese": 0.2840909090909091, + "Indonesian": 0.2556818181818182, + "Malay": 0.26704545454545453, + "English": 0.3068181818181818, + "Spanish": 0.29545454545454547, + "Filipino": 0.2784090909090909, + "Chinese": 0.26136363636363635 + }, + "consistency_score_2": 0.5232683982683982, + "consistency_score_3": 0.3470779220779221, + "consistency_score_4": 0.2475649350649351, + "consistency_score_5": 0.1818181818181818, + "consistency_score_6": 0.13392857142857142, + "consistency_score_7": 0.09659090909090909, + "detailed_consistency_score": { + "2_combine": { + "Vietnamese,Indonesian": 0.375, + "Vietnamese,Malay": 0.44886363636363635, + "Vietnamese,English": 0.5, + "Vietnamese,Spanish": 0.4943181818181818, + "Vietnamese,Filipino": 0.4659090909090909, + "Vietnamese,Chinese": 0.44886363636363635, + "Indonesian,Malay": 0.5852272727272727, + "Indonesian,English": 0.5681818181818182, + "Indonesian,Spanish": 0.5, + "Indonesian,Filipino": 0.4602272727272727, + "Indonesian,Chinese": 0.5170454545454546, + "Malay,English": 0.6193181818181818, + "Malay,Spanish": 0.5340909090909091, + "Malay,Filipino": 0.5340909090909091, + "Malay,Chinese": 0.4772727272727273, + "English,Spanish": 0.6306818181818182, + "English,Filipino": 0.625, + "English,Chinese": 0.625, + "Spanish,Filipino": 0.5965909090909091, + "Spanish,Chinese": 0.4943181818181818, + "Filipino,Chinese": 0.48863636363636365 + }, + "3_combine": { + "Vietnamese,Indonesian,Malay": 0.26704545454545453, + "Vietnamese,Indonesian,English": 0.2840909090909091, + "Vietnamese,Indonesian,Spanish": 0.25, + "Vietnamese,Indonesian,Filipino": 0.24431818181818182, + "Vietnamese,Indonesian,Chinese": 0.25, + "Vietnamese,Malay,English": 0.3409090909090909, + "Vietnamese,Malay,Spanish": 0.29545454545454547, + "Vietnamese,Malay,Filipino": 0.2897727272727273, + "Vietnamese,Malay,Chinese": 0.2556818181818182, + "Vietnamese,English,Spanish": 0.3522727272727273, + "Vietnamese,English,Filipino": 0.3522727272727273, + "Vietnamese,English,Chinese": 0.3409090909090909, + "Vietnamese,Spanish,Filipino": 0.3352272727272727, + "Vietnamese,Spanish,Chinese": 0.2897727272727273, + "Vietnamese,Filipino,Chinese": 0.2784090909090909, + "Indonesian,Malay,English": 0.44886363636363635, + "Indonesian,Malay,Spanish": 0.375, + "Indonesian,Malay,Filipino": 0.3693181818181818, + "Indonesian,Malay,Chinese": 0.3522727272727273, + "Indonesian,English,Spanish": 0.39204545454545453, + "Indonesian,English,Filipino": 0.375, + "Indonesian,English,Chinese": 0.4090909090909091, + "Indonesian,Spanish,Filipino": 0.3409090909090909, + "Indonesian,Spanish,Chinese": 0.3181818181818182, + "Indonesian,Filipino,Chinese": 0.30113636363636365, + "Malay,English,Spanish": 0.4318181818181818, + "Malay,English,Filipino": 0.4318181818181818, + "Malay,English,Chinese": 0.42045454545454547, + "Malay,Spanish,Filipino": 0.39204545454545453, + "Malay,Spanish,Chinese": 0.3409090909090909, + "Malay,Filipino,Chinese": 0.3352272727272727, + "English,Spanish,Filipino": 0.4943181818181818, + "English,Spanish,Chinese": 0.4090909090909091, + "English,Filipino,Chinese": 0.42045454545454547, + "Spanish,Filipino,Chinese": 0.36363636363636365 + }, + "4_combine": { + "Vietnamese,Indonesian,Malay,English": 0.22727272727272727, + "Vietnamese,Indonesian,Malay,Spanish": 0.18181818181818182, + "Vietnamese,Indonesian,Malay,Filipino": 0.18181818181818182, + "Vietnamese,Indonesian,Malay,Chinese": 0.17613636363636365, + "Vietnamese,Indonesian,English,Spanish": 0.2159090909090909, + "Vietnamese,Indonesian,English,Filipino": 0.19886363636363635, + "Vietnamese,Indonesian,English,Chinese": 0.2159090909090909, + "Vietnamese,Indonesian,Spanish,Filipino": 0.1875, + "Vietnamese,Indonesian,Spanish,Chinese": 0.18181818181818182, + "Vietnamese,Indonesian,Filipino,Chinese": 0.1590909090909091, + "Vietnamese,Malay,English,Spanish": 0.25, + "Vietnamese,Malay,English,Filipino": 0.23295454545454544, + "Vietnamese,Malay,English,Chinese": 0.23863636363636365, + "Vietnamese,Malay,Spanish,Filipino": 0.2159090909090909, + "Vietnamese,Malay,Spanish,Chinese": 0.19886363636363635, + "Vietnamese,Malay,Filipino,Chinese": 0.1875, + "Vietnamese,English,Spanish,Filipino": 0.2840909090909091, + "Vietnamese,English,Spanish,Chinese": 0.25, + "Vietnamese,English,Filipino,Chinese": 0.25, + "Vietnamese,Spanish,Filipino,Chinese": 0.21022727272727273, + "Indonesian,Malay,English,Spanish": 0.32954545454545453, + "Indonesian,Malay,English,Filipino": 0.3181818181818182, + "Indonesian,Malay,English,Chinese": 0.3181818181818182, + "Indonesian,Malay,Spanish,Filipino": 0.2784090909090909, + "Indonesian,Malay,Spanish,Chinese": 0.26136363636363635, + "Indonesian,Malay,Filipino,Chinese": 0.24431818181818182, + "Indonesian,English,Spanish,Filipino": 0.30113636363636365, + "Indonesian,English,Spanish,Chinese": 0.2840909090909091, + "Indonesian,English,Filipino,Chinese": 0.26704545454545453, + "Indonesian,Spanish,Filipino,Chinese": 0.23295454545454544, + "Malay,English,Spanish,Filipino": 0.3465909090909091, + "Malay,English,Spanish,Chinese": 0.3181818181818182, + "Malay,English,Filipino,Chinese": 0.3068181818181818, + "Malay,Spanish,Filipino,Chinese": 0.2784090909090909, + "English,Spanish,Filipino,Chinese": 0.3352272727272727 + }, + "5_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish": 0.17045454545454544, + "Vietnamese,Indonesian,Malay,English,Filipino": 0.1534090909090909, + "Vietnamese,Indonesian,Malay,English,Chinese": 0.17045454545454544, + "Vietnamese,Indonesian,Malay,Spanish,Filipino": 0.13636363636363635, + "Vietnamese,Indonesian,Malay,Spanish,Chinese": 0.13636363636363635, + "Vietnamese,Indonesian,Malay,Filipino,Chinese": 0.11931818181818182, + "Vietnamese,Indonesian,English,Spanish,Filipino": 0.16477272727272727, + "Vietnamese,Indonesian,English,Spanish,Chinese": 0.17045454545454544, + "Vietnamese,Indonesian,English,Filipino,Chinese": 0.14772727272727273, + "Vietnamese,Indonesian,Spanish,Filipino,Chinese": 0.125, + "Vietnamese,Malay,English,Spanish,Filipino": 0.19318181818181818, + "Vietnamese,Malay,English,Spanish,Chinese": 0.19318181818181818, + "Vietnamese,Malay,English,Filipino,Chinese": 0.17045454545454544, + "Vietnamese,Malay,Spanish,Filipino,Chinese": 0.1534090909090909, + "Vietnamese,English,Spanish,Filipino,Chinese": 0.19886363636363635, + "Indonesian,Malay,English,Spanish,Filipino": 0.2556818181818182, + "Indonesian,Malay,English,Spanish,Chinese": 0.24431818181818182, + "Indonesian,Malay,English,Filipino,Chinese": 0.22727272727272727, + "Indonesian,Malay,Spanish,Filipino,Chinese": 0.20454545454545456, + "Indonesian,English,Spanish,Filipino,Chinese": 0.2215909090909091, + "Malay,English,Spanish,Filipino,Chinese": 0.26136363636363635 + }, + "6_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish,Filipino": 0.125, + "Vietnamese,Indonesian,Malay,English,Spanish,Chinese": 0.13636363636363635, + "Vietnamese,Indonesian,Malay,English,Filipino,Chinese": 0.11363636363636363, + "Vietnamese,Indonesian,Malay,Spanish,Filipino,Chinese": 0.09659090909090909, + "Vietnamese,Indonesian,English,Spanish,Filipino,Chinese": 0.125, + "Vietnamese,Malay,English,Spanish,Filipino,Chinese": 0.14772727272727273, + "Indonesian,Malay,English,Spanish,Filipino,Chinese": 0.19318181818181818 + }, + "7_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish,Filipino,Chinese": 0.09659090909090909 + } + }, + "AC3_2": 0.3634446046243788, + "AC3_3": 0.3089741169314676, + "AC3_4": 0.2620826318244705, + "AC3_5": 0.21997755326308902, + "AC3_6": 0.1808562991687372, + "AC3_7": 0.14342286497552797 + }, + "prompt_5": { + "overall_acc": 0.28977272727272724, + "language_acc": { + "Vietnamese": 0.30113636363636365, + "Indonesian": 0.3068181818181818, + "Malay": 0.2840909090909091, + "English": 0.3125, + "Spanish": 0.2897727272727273, + "Filipino": 0.26704545454545453, + "Chinese": 0.26704545454545453 + }, + "consistency_score_2": 0.5338203463203464, + "consistency_score_3": 0.35876623376623373, + "consistency_score_4": 0.2577922077922078, + "consistency_score_5": 0.19101731601731603, + "consistency_score_6": 0.1436688311688312, + "consistency_score_7": 0.10795454545454546, + "detailed_consistency_score": { + "2_combine": { + "Vietnamese,Indonesian": 0.3977272727272727, + "Vietnamese,Malay": 0.48863636363636365, + "Vietnamese,English": 0.48863636363636365, + "Vietnamese,Spanish": 0.5, + "Vietnamese,Filipino": 0.4375, + "Vietnamese,Chinese": 0.42613636363636365, + "Indonesian,Malay": 0.5852272727272727, + "Indonesian,English": 0.6022727272727273, + "Indonesian,Spanish": 0.5397727272727273, + "Indonesian,Filipino": 0.5454545454545454, + "Indonesian,Chinese": 0.5340909090909091, + "Malay,English": 0.6193181818181818, + "Malay,Spanish": 0.5284090909090909, + "Malay,Filipino": 0.5397727272727273, + "Malay,Chinese": 0.4772727272727273, + "English,Spanish": 0.6477272727272727, + "English,Filipino": 0.6477272727272727, + "English,Chinese": 0.5568181818181818, + "Spanish,Filipino": 0.625, + "Spanish,Chinese": 0.48295454545454547, + "Filipino,Chinese": 0.5397727272727273 + }, + "3_combine": { + "Vietnamese,Indonesian,Malay": 0.2897727272727273, + "Vietnamese,Indonesian,English": 0.30113636363636365, + "Vietnamese,Indonesian,Spanish": 0.2897727272727273, + "Vietnamese,Indonesian,Filipino": 0.2556818181818182, + "Vietnamese,Indonesian,Chinese": 0.26136363636363635, + "Vietnamese,Malay,English": 0.3409090909090909, + "Vietnamese,Malay,Spanish": 0.3181818181818182, + "Vietnamese,Malay,Filipino": 0.2897727272727273, + "Vietnamese,Malay,Chinese": 0.2784090909090909, + "Vietnamese,English,Spanish": 0.3693181818181818, + "Vietnamese,English,Filipino": 0.32954545454545453, + "Vietnamese,English,Chinese": 0.29545454545454547, + "Vietnamese,Spanish,Filipino": 0.3409090909090909, + "Vietnamese,Spanish,Chinese": 0.2784090909090909, + "Vietnamese,Filipino,Chinese": 0.2784090909090909, + "Indonesian,Malay,English": 0.4602272727272727, + "Indonesian,Malay,Spanish": 0.38636363636363635, + "Indonesian,Malay,Filipino": 0.3977272727272727, + "Indonesian,Malay,Chinese": 0.3465909090909091, + "Indonesian,English,Spanish": 0.4431818181818182, + "Indonesian,English,Filipino": 0.4375, + "Indonesian,English,Chinese": 0.4034090909090909, + "Indonesian,Spanish,Filipino": 0.4090909090909091, + "Indonesian,Spanish,Chinese": 0.3465909090909091, + "Indonesian,Filipino,Chinese": 0.36363636363636365, + "Malay,English,Spanish": 0.4375, + "Malay,English,Filipino": 0.4431818181818182, + "Malay,English,Chinese": 0.375, + "Malay,Spanish,Filipino": 0.4090909090909091, + "Malay,Spanish,Chinese": 0.32954545454545453, + "Malay,Filipino,Chinese": 0.35795454545454547, + "English,Spanish,Filipino": 0.5113636363636364, + "English,Spanish,Chinese": 0.38636363636363635, + "English,Filipino,Chinese": 0.4090909090909091, + "Spanish,Filipino,Chinese": 0.38636363636363635 + }, + "4_combine": { + "Vietnamese,Indonesian,Malay,English": 0.24431818181818182, + "Vietnamese,Indonesian,Malay,Spanish": 0.23295454545454544, + "Vietnamese,Indonesian,Malay,Filipino": 0.2159090909090909, + "Vietnamese,Indonesian,Malay,Chinese": 0.19318181818181818, + "Vietnamese,Indonesian,English,Spanish": 0.25, + "Vietnamese,Indonesian,English,Filipino": 0.2159090909090909, + "Vietnamese,Indonesian,English,Chinese": 0.20454545454545456, + "Vietnamese,Indonesian,Spanish,Filipino": 0.23295454545454544, + "Vietnamese,Indonesian,Spanish,Chinese": 0.19318181818181818, + "Vietnamese,Indonesian,Filipino,Chinese": 0.17613636363636365, + "Vietnamese,Malay,English,Spanish": 0.26136363636363635, + "Vietnamese,Malay,English,Filipino": 0.23295454545454544, + "Vietnamese,Malay,English,Chinese": 0.2215909090909091, + "Vietnamese,Malay,Spanish,Filipino": 0.24431818181818182, + "Vietnamese,Malay,Spanish,Chinese": 0.21022727272727273, + "Vietnamese,Malay,Filipino,Chinese": 0.21022727272727273, + "Vietnamese,English,Spanish,Filipino": 0.2897727272727273, + "Vietnamese,English,Spanish,Chinese": 0.2215909090909091, + "Vietnamese,English,Filipino,Chinese": 0.20454545454545456, + "Vietnamese,Spanish,Filipino,Chinese": 0.2159090909090909, + "Indonesian,Malay,English,Spanish": 0.3465909090909091, + "Indonesian,Malay,English,Filipino": 0.3409090909090909, + "Indonesian,Malay,English,Chinese": 0.2840909090909091, + "Indonesian,Malay,Spanish,Filipino": 0.3181818181818182, + "Indonesian,Malay,Spanish,Chinese": 0.26136363636363635, + "Indonesian,Malay,Filipino,Chinese": 0.2727272727272727, + "Indonesian,English,Spanish,Filipino": 0.3522727272727273, + "Indonesian,English,Spanish,Chinese": 0.30113636363636365, + "Indonesian,English,Filipino,Chinese": 0.29545454545454547, + "Indonesian,Spanish,Filipino,Chinese": 0.2784090909090909, + "Malay,English,Spanish,Filipino": 0.3522727272727273, + "Malay,English,Spanish,Chinese": 0.2727272727272727, + "Malay,English,Filipino,Chinese": 0.2840909090909091, + "Malay,Spanish,Filipino,Chinese": 0.2784090909090909, + "English,Spanish,Filipino,Chinese": 0.3125 + }, + "5_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish": 0.20454545454545456, + "Vietnamese,Indonesian,Malay,English,Filipino": 0.18181818181818182, + "Vietnamese,Indonesian,Malay,English,Chinese": 0.1590909090909091, + "Vietnamese,Indonesian,Malay,Spanish,Filipino": 0.19886363636363635, + "Vietnamese,Indonesian,Malay,Spanish,Chinese": 0.1590909090909091, + "Vietnamese,Indonesian,Malay,Filipino,Chinese": 0.1534090909090909, + "Vietnamese,Indonesian,English,Spanish,Filipino": 0.19886363636363635, + "Vietnamese,Indonesian,English,Spanish,Chinese": 0.16477272727272727, + "Vietnamese,Indonesian,English,Filipino,Chinese": 0.14204545454545456, + "Vietnamese,Indonesian,Spanish,Filipino,Chinese": 0.1534090909090909, + "Vietnamese,Malay,English,Spanish,Filipino": 0.21022727272727273, + "Vietnamese,Malay,English,Spanish,Chinese": 0.17045454545454544, + "Vietnamese,Malay,English,Filipino,Chinese": 0.1590909090909091, + "Vietnamese,Malay,Spanish,Filipino,Chinese": 0.17613636363636365, + "Vietnamese,English,Spanish,Filipino,Chinese": 0.17045454545454544, + "Indonesian,Malay,English,Spanish,Filipino": 0.2784090909090909, + "Indonesian,Malay,English,Spanish,Chinese": 0.2215909090909091, + "Indonesian,Malay,English,Filipino,Chinese": 0.2215909090909091, + "Indonesian,Malay,Spanish,Filipino,Chinese": 0.2215909090909091, + "Indonesian,English,Spanish,Filipino,Chinese": 0.23863636363636365, + "Malay,English,Spanish,Filipino,Chinese": 0.22727272727272727 + }, + "6_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish,Filipino": 0.17045454545454544, + "Vietnamese,Indonesian,Malay,English,Spanish,Chinese": 0.13068181818181818, + "Vietnamese,Indonesian,Malay,English,Filipino,Chinese": 0.11931818181818182, + "Vietnamese,Indonesian,Malay,Spanish,Filipino,Chinese": 0.13636363636363635, + "Vietnamese,Indonesian,English,Spanish,Filipino,Chinese": 0.125, + "Vietnamese,Malay,English,Spanish,Filipino,Chinese": 0.14204545454545456, + "Indonesian,Malay,English,Spanish,Filipino,Chinese": 0.18181818181818182 + }, + "7_combine": { + "Vietnamese,Indonesian,Malay,English,Spanish,Filipino,Chinese": 0.10795454545454546 + } + }, + "AC3_2": 0.37563836454643423, + "AC3_3": 0.3205996131033705, + "AC3_4": 0.2728485566706514, + "AC3_5": 0.23025272415531203, + "AC3_6": 0.1920965270241183, + "AC3_7": 0.15730519476564378 + } }, "sg_eval": { - "prompt_1": -1 + "prompt_1": { + "accuracy": 0.39805825242718446 + }, + "prompt_2": { + "accuracy": 0.49514563106796117 + }, + "prompt_3": -1, + "prompt_4": { + "accuracy": 0.3592233009708738 + }, + "prompt_5": { + "accuracy": 0.4563106796116505 + } }, "cn_eval": { - "prompt_1": -1 + "prompt_1": { + "accuracy": 0.21904761904761905 + }, + "prompt_2": { + "accuracy": 0.21904761904761905 + }, + "prompt_3": -1, + "prompt_4": { + "accuracy": 0.21904761904761905 + }, + "prompt_5": { + "accuracy": 0.20952380952380953 + } }, "us_eval": { - "prompt_1": -1 + "prompt_1": { + "accuracy": 0.5233644859813084 + }, + "prompt_2": { + "accuracy": 0.45794392523364486 + }, + "prompt_3": -1, + "prompt_4": { + "accuracy": 0.4672897196261682 + }, + "prompt_5": { + "accuracy": 0.48598130841121495 + } }, "ph_eval": { - "prompt_1": -1 + "prompt_1": { + "accuracy": 0.41, + "category_acc": { + "brand": 0.3, + "demographics": 0.2, + "biology": 0.6, + "history": 0.26666666666666666, + "literature": 0.4, + "politics": 0.5, + "culture": 0.3, + "film": 0.7, + "law": 0.2, + "geography": 0.6 + } + }, + "prompt_2": { + "accuracy": 0.44, + "category_acc": { + "brand": 0.3, + "demographics": 0.0, + "biology": 0.6, + "history": 0.3333333333333333, + "literature": 0.4, + "politics": 0.5, + "culture": 0.4, + "film": 0.5, + "law": 0.5, + "geography": 0.7 + } + }, + "prompt_3": { + "accuracy": 0.41, + "category_acc": { + "brand": 0.3, + "demographics": 0.0, + "biology": 0.6, + "history": 0.3333333333333333, + "literature": 0.3, + "politics": 0.5, + "culture": 0.2, + "film": 0.7, + "law": 0.4, + "geography": 0.6 + } + }, + "prompt_4": { + "accuracy": 0.39, + "category_acc": { + "brand": 0.2, + "demographics": 0.0, + "biology": 0.7, + "history": 0.3333333333333333, + "literature": 0.3, + "politics": 0.5, + "culture": 0.0, + "film": 0.6, + "law": 0.4, + "geography": 0.7 + } + }, + "prompt_5": { + "accuracy": 0.39, + "category_acc": { + "brand": 0.2, + "demographics": 0.0, + "biology": 0.6, + "history": 0.3333333333333333, + "literature": 0.3, + "politics": 0.6, + "culture": 0.3, + "film": 0.5, + "law": 0.4, + "geography": 0.5 + } + } }, "sing2eng": { - "prompt_1": -1 + "prompt_1": { + "bleu_score": 0.19325979062685159 + }, + "prompt_2": { + "bleu_score": 0.19719902017779267 + }, + "prompt_3": { + "bleu_score": 0.20314300474815059 + }, + "prompt_4": { + "bleu_score": 0.19526345749622018 + }, + "prompt_5": { + "bleu_score": 0.09868754065607462 + } }, "flores_ind2eng": { - "prompt_1": -1 + "prompt_1": { + "bleu_score": 0.036679748589490795 + }, + "prompt_2": { + "bleu_score": 0.04730824109026968 + }, + "prompt_3": { + "bleu_score": 0.03716223465101119 + }, + "prompt_4": { + "bleu_score": 0.03995567721020701 + }, + "prompt_5": { + "bleu_score": 0.03901277937421067 + } }, "flores_vie2eng": { - "prompt_1": -1 + "prompt_1": { + "bleu_score": 0.013686452890985893 + }, + "prompt_2": { + "bleu_score": 0.01536493026677872 + }, + "prompt_3": { + "bleu_score": 0.01063369404270174 + }, + "prompt_4": { + "bleu_score": 0.01510255811769833 + }, + "prompt_5": { + "bleu_score": 0.013364368736344938 + } }, "flores_zho2eng": { - "prompt_1": -1 + "prompt_1": { + "bleu_score": 0.04080414274998884 + }, + "prompt_2": { + "bleu_score": 0.059894275340086615 + }, + "prompt_3": { + "bleu_score": 0.04243299674633565 + }, + "prompt_4": { + "bleu_score": 0.04490739901907554 + }, + "prompt_5": { + "bleu_score": 0.04342511011220152 + } }, "flores_zsm2eng": { - "prompt_1": -1 + "prompt_1": { + "bleu_score": 0.03291458077267792 + }, + "prompt_2": { + "bleu_score": 0.04186085104111973 + }, + "prompt_3": { + "bleu_score": 0.03437371102970113 + }, + "prompt_4": { + "bleu_score": 0.036268455238204725 + }, + "prompt_5": { + "bleu_score": 0.03607138242086422 + } }, "mmlu": { - "prompt_1": -1 + "prompt_1": { + "accuracy": 0.4807467911318553 + }, + "prompt_2": { + "accuracy": 0.47607934655775963 + }, + "prompt_3": { + "accuracy": 0.47724620770128356 + }, + "prompt_4": { + "accuracy": 0.5075845974329055 + }, + "prompt_5": { + "accuracy": 0.4982497082847141 + } }, "mmlu_full": { - "prompt_1": -1 + "prompt_1": { + "accuracy": 0.455917053986414, + "category_acc": { + "high_school_european_history": 0.4451219512195122, + "business_ethics": 0.5757575757575758, + "clinical_knowledge": 0.5378787878787878, + "medical_genetics": 0.5555555555555556, + "high_school_us_history": 0.4729064039408867, + "high_school_physics": 0.26, + "high_school_world_history": 0.5042372881355932, + "virology": 0.47878787878787876, + "high_school_microeconomics": 0.4936708860759494, + "econometrics": 0.26548672566371684, + "college_computer_science": 0.40404040404040403, + "high_school_biology": 0.5728155339805825, + "abstract_algebra": 0.25252525252525254, + "professional_accounting": 0.3665480427046263, + "philosophy": 0.45806451612903226, + "professional_medicine": 0.36162361623616235, + "nutrition": 0.5540983606557377, + "global_facts": 0.24242424242424243, + "machine_learning": 0.43243243243243246, + "security_studies": 0.3524590163934426, + "public_relations": 0.5596330275229358, + "professional_psychology": 0.4877250409165303, + "prehistory": 0.4551083591331269, + "anatomy": 0.44029850746268656, + "human_sexuality": 0.5846153846153846, + "college_medicine": 0.43023255813953487, + "high_school_government_and_politics": 0.640625, + "college_chemistry": 0.3333333333333333, + "logical_fallacies": 0.5679012345679012, + "high_school_geography": 0.6091370558375635, + "elementary_mathematics": 0.363395225464191, + "human_aging": 0.581081081081081, + "college_mathematics": 0.40404040404040403, + "high_school_psychology": 0.6415441176470589, + "formal_logic": 0.344, + "high_school_statistics": 0.29767441860465116, + "international_law": 0.6916666666666667, + "high_school_mathematics": 0.29739776951672864, + "high_school_computer_science": 0.47474747474747475, + "conceptual_physics": 0.45726495726495725, + "miscellaneous": 0.6112531969309463, + "high_school_chemistry": 0.31683168316831684, + "marketing": 0.7639484978540773, + "professional_law": 0.3039791258969341, + "management": 0.6568627450980392, + "college_physics": 0.22772277227722773, + "jurisprudence": 0.5981308411214953, + "world_religions": 0.6588235294117647, + "sociology": 0.695, + "us_foreign_policy": 0.6767676767676768, + "high_school_macroeconomics": 0.442159383033419, + "computer_security": 0.5151515151515151, + "moral_scenarios": 0.24161073825503357, + "moral_disputes": 0.5217391304347826, + "electrical_engineering": 0.5069444444444444, + "astronomy": 0.4900662251655629, + "college_biology": 0.4965034965034965 + } + }, + "prompt_2": { + "accuracy": 0.4519127636753665, + "category_acc": { + "high_school_european_history": 0.3780487804878049, + "business_ethics": 0.6262626262626263, + "clinical_knowledge": 0.553030303030303, + "medical_genetics": 0.5959595959595959, + "high_school_us_history": 0.42857142857142855, + "high_school_physics": 0.26, + "high_school_world_history": 0.4322033898305085, + "virology": 0.46060606060606063, + "high_school_microeconomics": 0.45569620253164556, + "econometrics": 0.25663716814159293, + "college_computer_science": 0.3333333333333333, + "high_school_biology": 0.5598705501618123, + "abstract_algebra": 0.2828282828282828, + "professional_accounting": 0.3202846975088968, + "philosophy": 0.46774193548387094, + "professional_medicine": 0.35424354243542433, + "nutrition": 0.521311475409836, + "global_facts": 0.30303030303030304, + "machine_learning": 0.42342342342342343, + "security_studies": 0.3524590163934426, + "public_relations": 0.5045871559633027, + "professional_psychology": 0.4795417348608838, + "prehistory": 0.4984520123839009, + "anatomy": 0.4925373134328358, + "human_sexuality": 0.5384615384615384, + "college_medicine": 0.436046511627907, + "high_school_government_and_politics": 0.6510416666666666, + "college_chemistry": 0.31313131313131315, + "logical_fallacies": 0.5740740740740741, + "high_school_geography": 0.6243654822335025, + "elementary_mathematics": 0.35809018567639256, + "human_aging": 0.5945945945945946, + "college_mathematics": 0.35353535353535354, + "high_school_psychology": 0.6433823529411765, + "formal_logic": 0.344, + "high_school_statistics": 0.2744186046511628, + "international_law": 0.6666666666666666, + "high_school_mathematics": 0.2862453531598513, + "high_school_computer_science": 0.4444444444444444, + "conceptual_physics": 0.45726495726495725, + "miscellaneous": 0.6278772378516624, + "high_school_chemistry": 0.3316831683168317, + "marketing": 0.7639484978540773, + "professional_law": 0.29810828440965426, + "management": 0.6470588235294118, + "college_physics": 0.27722772277227725, + "jurisprudence": 0.5794392523364486, + "world_religions": 0.6529411764705882, + "sociology": 0.68, + "us_foreign_policy": 0.7070707070707071, + "high_school_macroeconomics": 0.46786632390745503, + "computer_security": 0.5252525252525253, + "moral_scenarios": 0.2371364653243848, + "moral_disputes": 0.5246376811594203, + "electrical_engineering": 0.4791666666666667, + "astronomy": 0.5364238410596026, + "college_biology": 0.46153846153846156 + } + }, + "prompt_3": { + "accuracy": 0.454486950303897, + "category_acc": { + "high_school_european_history": 0.39634146341463417, + "business_ethics": 0.6161616161616161, + "clinical_knowledge": 0.5681818181818182, + "medical_genetics": 0.5656565656565656, + "high_school_us_history": 0.42857142857142855, + "high_school_physics": 0.25333333333333335, + "high_school_world_history": 0.423728813559322, + "virology": 0.49696969696969695, + "high_school_microeconomics": 0.47257383966244726, + "econometrics": 0.3274336283185841, + "college_computer_science": 0.3333333333333333, + "high_school_biology": 0.5631067961165048, + "abstract_algebra": 0.26262626262626265, + "professional_accounting": 0.35231316725978645, + "philosophy": 0.49032258064516127, + "professional_medicine": 0.35424354243542433, + "nutrition": 0.521311475409836, + "global_facts": 0.36363636363636365, + "machine_learning": 0.42342342342342343, + "security_studies": 0.36885245901639346, + "public_relations": 0.5045871559633027, + "professional_psychology": 0.4844517184942717, + "prehistory": 0.4984520123839009, + "anatomy": 0.47761194029850745, + "human_sexuality": 0.5153846153846153, + "college_medicine": 0.4186046511627907, + "high_school_government_and_politics": 0.640625, + "college_chemistry": 0.3333333333333333, + "logical_fallacies": 0.5679012345679012, + "high_school_geography": 0.6243654822335025, + "elementary_mathematics": 0.3448275862068966, + "human_aging": 0.5855855855855856, + "college_mathematics": 0.35353535353535354, + "high_school_psychology": 0.6544117647058824, + "formal_logic": 0.32, + "high_school_statistics": 0.29767441860465116, + "international_law": 0.6666666666666666, + "high_school_mathematics": 0.2862453531598513, + "high_school_computer_science": 0.4444444444444444, + "conceptual_physics": 0.44871794871794873, + "miscellaneous": 0.6278772378516624, + "high_school_chemistry": 0.3415841584158416, + "marketing": 0.7553648068669528, + "professional_law": 0.3039791258969341, + "management": 0.6372549019607843, + "college_physics": 0.3069306930693069, + "jurisprudence": 0.5514018691588785, + "world_religions": 0.6294117647058823, + "sociology": 0.72, + "us_foreign_policy": 0.7070707070707071, + "high_school_macroeconomics": 0.45758354755784064, + "computer_security": 0.5252525252525253, + "moral_scenarios": 0.2348993288590604, + "moral_disputes": 0.5014492753623189, + "electrical_engineering": 0.5, + "astronomy": 0.5165562913907285, + "college_biology": 0.4755244755244755 + } + }, + "prompt_4": { + "accuracy": 0.4616374687164819, + "category_acc": { + "high_school_european_history": 0.4451219512195122, + "business_ethics": 0.5959595959595959, + "clinical_knowledge": 0.5454545454545454, + "medical_genetics": 0.5555555555555556, + "high_school_us_history": 0.4876847290640394, + "high_school_physics": 0.28, + "high_school_world_history": 0.5084745762711864, + "virology": 0.4727272727272727, + "high_school_microeconomics": 0.47257383966244726, + "econometrics": 0.2920353982300885, + "college_computer_science": 0.3333333333333333, + "high_school_biology": 0.5728155339805825, + "abstract_algebra": 0.31313131313131315, + "professional_accounting": 0.38434163701067614, + "philosophy": 0.4838709677419355, + "professional_medicine": 0.3763837638376384, + "nutrition": 0.5540983606557377, + "global_facts": 0.3333333333333333, + "machine_learning": 0.40540540540540543, + "security_studies": 0.39344262295081966, + "public_relations": 0.47706422018348627, + "professional_psychology": 0.4877250409165303, + "prehistory": 0.5170278637770898, + "anatomy": 0.4626865671641791, + "human_sexuality": 0.5923076923076923, + "college_medicine": 0.4127906976744186, + "high_school_government_and_politics": 0.6510416666666666, + "college_chemistry": 0.36363636363636365, + "logical_fallacies": 0.5802469135802469, + "high_school_geography": 0.6446700507614214, + "elementary_mathematics": 0.4005305039787798, + "human_aging": 0.5720720720720721, + "college_mathematics": 0.35353535353535354, + "high_school_psychology": 0.6415441176470589, + "formal_logic": 0.328, + "high_school_statistics": 0.30697674418604654, + "international_law": 0.7, + "high_school_mathematics": 0.275092936802974, + "high_school_computer_science": 0.45454545454545453, + "conceptual_physics": 0.43162393162393164, + "miscellaneous": 0.6150895140664961, + "high_school_chemistry": 0.36633663366336633, + "marketing": 0.7553648068669528, + "professional_law": 0.3020221787345075, + "management": 0.5980392156862745, + "college_physics": 0.22772277227722773, + "jurisprudence": 0.5607476635514018, + "world_religions": 0.6470588235294118, + "sociology": 0.7, + "us_foreign_policy": 0.7272727272727273, + "high_school_macroeconomics": 0.455012853470437, + "computer_security": 0.5656565656565656, + "moral_scenarios": 0.24272930648769575, + "moral_disputes": 0.5304347826086957, + "electrical_engineering": 0.4861111111111111, + "astronomy": 0.5231788079470199, + "college_biology": 0.5104895104895105 + } + }, + "prompt_5": { + "accuracy": 0.46485520200214514, + "category_acc": { + "high_school_european_history": 0.4573170731707317, + "business_ethics": 0.5757575757575758, + "clinical_knowledge": 0.5378787878787878, + "medical_genetics": 0.5757575757575758, + "high_school_us_history": 0.47783251231527096, + "high_school_physics": 0.26, + "high_school_world_history": 0.5084745762711864, + "virology": 0.4484848484848485, + "high_school_microeconomics": 0.48945147679324896, + "econometrics": 0.2831858407079646, + "college_computer_science": 0.3838383838383838, + "high_school_biology": 0.5857605177993528, + "abstract_algebra": 0.29292929292929293, + "professional_accounting": 0.3736654804270463, + "philosophy": 0.4967741935483871, + "professional_medicine": 0.3800738007380074, + "nutrition": 0.5639344262295082, + "global_facts": 0.29292929292929293, + "machine_learning": 0.4144144144144144, + "security_studies": 0.4016393442622951, + "public_relations": 0.46788990825688076, + "professional_psychology": 0.49754500818330605, + "prehistory": 0.5108359133126935, + "anatomy": 0.4626865671641791, + "human_sexuality": 0.6, + "college_medicine": 0.4011627906976744, + "high_school_government_and_politics": 0.6770833333333334, + "college_chemistry": 0.36363636363636365, + "logical_fallacies": 0.6049382716049383, + "high_school_geography": 0.6395939086294417, + "elementary_mathematics": 0.3819628647214854, + "human_aging": 0.581081081081081, + "college_mathematics": 0.3838383838383838, + "high_school_psychology": 0.6617647058823529, + "formal_logic": 0.304, + "high_school_statistics": 0.2930232558139535, + "international_law": 0.6666666666666666, + "high_school_mathematics": 0.2527881040892193, + "high_school_computer_science": 0.46464646464646464, + "conceptual_physics": 0.4444444444444444, + "miscellaneous": 0.6368286445012787, + "high_school_chemistry": 0.32673267326732675, + "marketing": 0.7467811158798283, + "professional_law": 0.30919765166340507, + "management": 0.6274509803921569, + "college_physics": 0.25742574257425743, + "jurisprudence": 0.616822429906542, + "world_religions": 0.6647058823529411, + "sociology": 0.705, + "us_foreign_policy": 0.7070707070707071, + "high_school_macroeconomics": 0.46786632390745503, + "computer_security": 0.494949494949495, + "moral_scenarios": 0.23825503355704697, + "moral_disputes": 0.5362318840579711, + "electrical_engineering": 0.5069444444444444, + "astronomy": 0.5231788079470199, + "college_biology": 0.5244755244755245 + } + } }, "c_eval": { - "prompt_1": -1 - }, - "c_eval_full": { - "prompt_1": -1 - }, - "cmmlu": { - "prompt_1": -1 - }, - "cmmlu_full": { - "prompt_1": -1 - }, - "zbench": { - "prompt_1": -1 - }, - "ind_emotion": { - "prompt_1": -1 - }, - "ocnli": { - "prompt_1": -1 - }, - "c3": { - "prompt_1": -1 - }, - "dream": { - "prompt_1": -1 - }, - "samsum": { - "prompt_1": -1 - }, - "dialogsum": { - "prompt_1": -1 - }, - "sst2": { - "prompt_1": -1 - }, - "cola": { - "prompt_1": -1 - }, - "qqp": { - "prompt_1": -1 - }, - "mnli": { - "prompt_1": -1 - }, - "qnli": { - "prompt_1": -1 - }, - "wnli": { - "prompt_1": -1 - }, - "rte": { - "prompt_1": -1 - }, - "mrpc": { - "prompt_1": -1 - } - } - }, - "phi-2": { - "model_size": "2.7B", - "model_link": "https://huggingface.co/microsoft/phi-2", - "zero_shot": { - "cross_mmlu": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": { + "accuracy": 0.2689450222882615 + }, + "prompt_2": { + "accuracy": 0.25928677563150077 + }, + "prompt_3": { + "accuracy": 0.2578008915304606 + }, + "prompt_4": { + "accuracy": 0.2674591381872214 + }, + "prompt_5": { + "accuracy": 0.2637444279346211 + } + }, + "c_eval_full": { + "prompt_1": { + "accuracy": 0.26089663760896636, + "category_acc": { + "computer_network": 0.2916666666666667, + "operating_system": 0.25, + "computer_architecture": 0.46153846153846156, + "college_programming": 0.42857142857142855, + "college_physics": 0.2916666666666667, + "college_chemistry": 0.1724137931034483, + "advanced_mathematics": 0.3333333333333333, + "probability_and_statistics": 0.2608695652173913, + "discrete_mathematics": 0.3333333333333333, + "electrical_engineer": 0.2619047619047619, + "metrology_engineer": 0.10344827586206896, + "high_school_mathematics": 0.2608695652173913, + "high_school_physics": 0.25, + "high_school_chemistry": 0.125, + "high_school_biology": 0.25, + "middle_school_mathematics": 0.20833333333333334, + "middle_school_biology": 0.15384615384615385, + "middle_school_physics": 0.16666666666666666, + "middle_school_chemistry": 0.16, + "veterinary_medicine": 0.25, + "college_economics": 0.36666666666666664, + "business_administration": 0.18421052631578946, + "marxism": 0.20833333333333334, + "mao_zedong_thought": 0.3103448275862069, + "education_science": 0.35294117647058826, + "teacher_qualification": 0.2653061224489796, + "high_school_politics": 0.2916666666666667, + "high_school_geography": 0.3333333333333333, + "middle_school_politics": 0.34615384615384615, + "middle_school_geography": 0.11764705882352941, + "modern_chinese_history": 0.17857142857142858, + "ideological_and_moral_cultivation": 0.4166666666666667, + "logic": 0.25925925925925924, + "law": 0.2413793103448276, + "chinese_language_and_literature": 0.2857142857142857, + "art_studies": 0.3684210526315789, + "professional_tour_guide": 0.35294117647058826, + "legal_professional": 0.21428571428571427, + "high_school_chinese": 0.2916666666666667, + "high_school_history": 0.32, + "middle_school_history": 0.2222222222222222, + "civil_servant": 0.25, + "sports_science": 0.25, + "plant_protection": 0.37037037037037035, + "basic_medicine": 0.20833333333333334, + "clinical_medicine": 0.37037037037037035, + "urban_and_rural_planner": 0.21568627450980393, + "accountant": 0.2222222222222222, + "fire_engineer": 0.2222222222222222, + "environmental_impact_assessment_engineer": 0.19444444444444445, + "tax_accountant": 0.12962962962962962, + "physician": 0.2037037037037037 + } + }, + "prompt_2": { + "accuracy": 0.24906600249066002, + "category_acc": { + "computer_network": 0.3333333333333333, + "operating_system": 0.25, + "computer_architecture": 0.34615384615384615, + "college_programming": 0.30952380952380953, + "college_physics": 0.2916666666666667, + "college_chemistry": 0.20689655172413793, + "advanced_mathematics": 0.20833333333333334, + "probability_and_statistics": 0.17391304347826086, + "discrete_mathematics": 0.38095238095238093, + "electrical_engineer": 0.2619047619047619, + "metrology_engineer": 0.1724137931034483, + "high_school_mathematics": 0.34782608695652173, + "high_school_physics": 0.16666666666666666, + "high_school_chemistry": 0.125, + "high_school_biology": 0.20833333333333334, + "middle_school_mathematics": 0.2916666666666667, + "middle_school_biology": 0.19230769230769232, + "middle_school_physics": 0.20833333333333334, + "middle_school_chemistry": 0.16, + "veterinary_medicine": 0.25, + "college_economics": 0.35, + "business_administration": 0.15789473684210525, + "marxism": 0.3333333333333333, + "mao_zedong_thought": 0.3103448275862069, + "education_science": 0.2647058823529412, + "teacher_qualification": 0.24489795918367346, + "high_school_politics": 0.25, + "high_school_geography": 0.2916666666666667, + "middle_school_politics": 0.3076923076923077, + "middle_school_geography": 0.11764705882352941, + "modern_chinese_history": 0.10714285714285714, + "ideological_and_moral_cultivation": 0.25, + "logic": 0.2962962962962963, + "law": 0.2413793103448276, + "chinese_language_and_literature": 0.25, + "art_studies": 0.34210526315789475, + "professional_tour_guide": 0.3235294117647059, + "legal_professional": 0.21428571428571427, + "high_school_chinese": 0.20833333333333334, + "high_school_history": 0.32, + "middle_school_history": 0.2962962962962963, + "civil_servant": 0.3076923076923077, + "sports_science": 0.20833333333333334, + "plant_protection": 0.2962962962962963, + "basic_medicine": 0.16666666666666666, + "clinical_medicine": 0.2962962962962963, + "urban_and_rural_planner": 0.21568627450980393, + "accountant": 0.2222222222222222, + "fire_engineer": 0.25, + "environmental_impact_assessment_engineer": 0.16666666666666666, + "tax_accountant": 0.16666666666666666, + "physician": 0.2222222222222222 + } + }, + "prompt_3": { + "accuracy": 0.2459526774595268, + "category_acc": { + "computer_network": 0.25, + "operating_system": 0.25, + "computer_architecture": 0.3076923076923077, + "college_programming": 0.2857142857142857, + "college_physics": 0.20833333333333334, + "college_chemistry": 0.2413793103448276, + "advanced_mathematics": 0.125, + "probability_and_statistics": 0.17391304347826086, + "discrete_mathematics": 0.3333333333333333, + "electrical_engineer": 0.23809523809523808, + "metrology_engineer": 0.13793103448275862, + "high_school_mathematics": 0.34782608695652173, + "high_school_physics": 0.16666666666666666, + "high_school_chemistry": 0.08333333333333333, + "high_school_biology": 0.25, + "middle_school_mathematics": 0.20833333333333334, + "middle_school_biology": 0.19230769230769232, + "middle_school_physics": 0.2916666666666667, + "middle_school_chemistry": 0.2, + "veterinary_medicine": 0.25, + "college_economics": 0.35, + "business_administration": 0.15789473684210525, + "marxism": 0.3333333333333333, + "mao_zedong_thought": 0.41379310344827586, + "education_science": 0.23529411764705882, + "teacher_qualification": 0.2653061224489796, + "high_school_politics": 0.20833333333333334, + "high_school_geography": 0.20833333333333334, + "middle_school_politics": 0.3076923076923077, + "middle_school_geography": 0.29411764705882354, + "modern_chinese_history": 0.17857142857142858, + "ideological_and_moral_cultivation": 0.2916666666666667, + "logic": 0.25925925925925924, + "law": 0.13793103448275862, + "chinese_language_and_literature": 0.25, + "art_studies": 0.3684210526315789, + "professional_tour_guide": 0.35294117647058826, + "legal_professional": 0.17857142857142858, + "high_school_chinese": 0.20833333333333334, + "high_school_history": 0.32, + "middle_school_history": 0.3333333333333333, + "civil_servant": 0.28846153846153844, + "sports_science": 0.20833333333333334, + "plant_protection": 0.3333333333333333, + "basic_medicine": 0.20833333333333334, + "clinical_medicine": 0.25925925925925924, + "urban_and_rural_planner": 0.19607843137254902, + "accountant": 0.2222222222222222, + "fire_engineer": 0.25, + "environmental_impact_assessment_engineer": 0.16666666666666666, + "tax_accountant": 0.16666666666666666, + "physician": 0.24074074074074073 + } + }, + "prompt_4": { + "accuracy": 0.24346201743462018, + "category_acc": { + "computer_network": 0.2916666666666667, + "operating_system": 0.20833333333333334, + "computer_architecture": 0.38461538461538464, + "college_programming": 0.35714285714285715, + "college_physics": 0.20833333333333334, + "college_chemistry": 0.3103448275862069, + "advanced_mathematics": 0.25, + "probability_and_statistics": 0.21739130434782608, + "discrete_mathematics": 0.2857142857142857, + "electrical_engineer": 0.23809523809523808, + "metrology_engineer": 0.1724137931034483, + "high_school_mathematics": 0.30434782608695654, + "high_school_physics": 0.16666666666666666, + "high_school_chemistry": 0.16666666666666666, + "high_school_biology": 0.2916666666666667, + "middle_school_mathematics": 0.16666666666666666, + "middle_school_biology": 0.15384615384615385, + "middle_school_physics": 0.20833333333333334, + "middle_school_chemistry": 0.16, + "veterinary_medicine": 0.14285714285714285, + "college_economics": 0.35, + "business_administration": 0.13157894736842105, + "marxism": 0.25, + "mao_zedong_thought": 0.3103448275862069, + "education_science": 0.29411764705882354, + "teacher_qualification": 0.2857142857142857, + "high_school_politics": 0.4583333333333333, + "high_school_geography": 0.25, + "middle_school_politics": 0.34615384615384615, + "middle_school_geography": 0.11764705882352941, + "modern_chinese_history": 0.21428571428571427, + "ideological_and_moral_cultivation": 0.25, + "logic": 0.2222222222222222, + "law": 0.2413793103448276, + "chinese_language_and_literature": 0.14285714285714285, + "art_studies": 0.3684210526315789, + "professional_tour_guide": 0.38235294117647056, + "legal_professional": 0.21428571428571427, + "high_school_chinese": 0.25, + "high_school_history": 0.24, + "middle_school_history": 0.25925925925925924, + "civil_servant": 0.23076923076923078, + "sports_science": 0.16666666666666666, + "plant_protection": 0.3333333333333333, + "basic_medicine": 0.125, + "clinical_medicine": 0.25925925925925924, + "urban_and_rural_planner": 0.21568627450980393, + "accountant": 0.18518518518518517, + "fire_engineer": 0.19444444444444445, + "environmental_impact_assessment_engineer": 0.2222222222222222, + "tax_accountant": 0.14814814814814814, + "physician": 0.2222222222222222 + } + }, + "prompt_5": { + "accuracy": 0.25404732254047324, + "category_acc": { + "computer_network": 0.2916666666666667, + "operating_system": 0.25, + "computer_architecture": 0.4230769230769231, + "college_programming": 0.38095238095238093, + "college_physics": 0.25, + "college_chemistry": 0.13793103448275862, + "advanced_mathematics": 0.20833333333333334, + "probability_and_statistics": 0.2608695652173913, + "discrete_mathematics": 0.38095238095238093, + "electrical_engineer": 0.23809523809523808, + "metrology_engineer": 0.13793103448275862, + "high_school_mathematics": 0.21739130434782608, + "high_school_physics": 0.25, + "high_school_chemistry": 0.125, + "high_school_biology": 0.25, + "middle_school_mathematics": 0.20833333333333334, + "middle_school_biology": 0.15384615384615385, + "middle_school_physics": 0.125, + "middle_school_chemistry": 0.16, + "veterinary_medicine": 0.25, + "college_economics": 0.36666666666666664, + "business_administration": 0.18421052631578946, + "marxism": 0.2916666666666667, + "mao_zedong_thought": 0.2413793103448276, + "education_science": 0.23529411764705882, + "teacher_qualification": 0.30612244897959184, + "high_school_politics": 0.4166666666666667, + "high_school_geography": 0.3333333333333333, + "middle_school_politics": 0.38461538461538464, + "middle_school_geography": 0.11764705882352941, + "modern_chinese_history": 0.21428571428571427, + "ideological_and_moral_cultivation": 0.375, + "logic": 0.25925925925925924, + "law": 0.2413793103448276, + "chinese_language_and_literature": 0.25, + "art_studies": 0.34210526315789475, + "professional_tour_guide": 0.38235294117647056, + "legal_professional": 0.17857142857142858, + "high_school_chinese": 0.25, + "high_school_history": 0.36, + "middle_school_history": 0.2962962962962963, + "civil_servant": 0.21153846153846154, + "sports_science": 0.20833333333333334, + "plant_protection": 0.4074074074074074, + "basic_medicine": 0.125, + "clinical_medicine": 0.2962962962962963, + "urban_and_rural_planner": 0.23529411764705882, + "accountant": 0.2037037037037037, + "fire_engineer": 0.2222222222222222, + "environmental_impact_assessment_engineer": 0.2222222222222222, + "tax_accountant": 0.12962962962962962, + "physician": 0.2222222222222222 + } + } }, - "cross_logiqa": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "cmmlu": { + "prompt_1": { + "accuracy": 0.25806451612903225 + }, + "prompt_2": { + "accuracy": 0.24731182795698925 + }, + "prompt_3": { + "accuracy": 0.2724014336917563 + }, + "prompt_4": { + "accuracy": 0.27956989247311825 + }, + "prompt_5": { + "accuracy": 0.2616487455197133 + } }, - "sg_eval": { + "cmmlu_full": { "prompt_1": { - "accuracy": 0.46601941747572817 + "accuracy": 0.2672250043170437, + "category_acc": { + "agronomy": 0.24260355029585798, + "anatomy": 0.25675675675675674, + "ancient_chinese": 0.2682926829268293, + "arts": 0.26875, + "astronomy": 0.2727272727272727, + "business_ethics": 0.27751196172248804, + "chinese_civil_service_exam": 0.23125, + "chinese_driving_rule": 0.31297709923664124, + "chinese_food_culture": 0.2426470588235294, + "chinese_foreign_policy": 0.19626168224299065, + "chinese_history": 0.23839009287925697, + "chinese_literature": 0.24019607843137256, + "chinese_teacher_qualification": 0.25139664804469275, + "clinical_knowledge": 0.2489451476793249, + "college_actuarial_science": 0.25471698113207547, + "college_education": 0.3364485981308411, + "college_engineering_hydrology": 0.3018867924528302, + "college_law": 0.2037037037037037, + "college_mathematics": 0.2571428571428571, + "college_medical_statistics": 0.330188679245283, + "college_medicine": 0.2893772893772894, + "computer_science": 0.29901960784313725, + "computer_security": 0.23976608187134502, + "conceptual_physics": 0.272108843537415, + "construction_project_management": 0.28776978417266186, + "economics": 0.27672955974842767, + "education": 0.26993865030674846, + "electrical_engineering": 0.27906976744186046, + "elementary_chinese": 0.32936507936507936, + "elementary_commonsense": 0.2777777777777778, + "elementary_information_and_technology": 0.2647058823529412, + "elementary_mathematics": 0.26521739130434785, + "ethnology": 0.3111111111111111, + "food_science": 0.25874125874125875, + "genetics": 0.25, + "global_facts": 0.2080536912751678, + "high_school_biology": 0.25443786982248523, + "high_school_chemistry": 0.24242424242424243, + "high_school_geography": 0.2627118644067797, + "high_school_mathematics": 0.18292682926829268, + "high_school_physics": 0.2727272727272727, + "high_school_politics": 0.27972027972027974, + "human_sexuality": 0.23809523809523808, + "international_law": 0.2756756756756757, + "journalism": 0.27325581395348836, + "jurisprudence": 0.2798053527980535, + "legal_and_moral_basis": 0.308411214953271, + "logical": 0.2764227642276423, + "machine_learning": 0.22131147540983606, + "management": 0.2619047619047619, + "marketing": 0.31666666666666665, + "marxist_theory": 0.2857142857142857, + "modern_chinese": 0.25862068965517243, + "nutrition": 0.3448275862068966, + "philosophy": 0.34285714285714286, + "professional_accounting": 0.28, + "professional_law": 0.25118483412322273, + "professional_medicine": 0.24468085106382978, + "professional_psychology": 0.2629310344827586, + "public_relations": 0.25287356321839083, + "security_study": 0.23703703703703705, + "sociology": 0.26548672566371684, + "sports_science": 0.296969696969697, + "traditional_chinese_medicine": 0.22702702702702704, + "virology": 0.2603550295857988, + "world_history": 0.2857142857142857, + "world_religions": 0.2625 + } + }, + "prompt_2": { + "accuracy": 0.25884993956138835, + "category_acc": { + "agronomy": 0.23076923076923078, + "anatomy": 0.22972972972972974, + "ancient_chinese": 0.25609756097560976, + "arts": 0.2125, + "astronomy": 0.24848484848484848, + "business_ethics": 0.23923444976076555, + "chinese_civil_service_exam": 0.2, + "chinese_driving_rule": 0.29770992366412213, + "chinese_food_culture": 0.19117647058823528, + "chinese_foreign_policy": 0.27102803738317754, + "chinese_history": 0.25386996904024767, + "chinese_literature": 0.25, + "chinese_teacher_qualification": 0.25139664804469275, + "clinical_knowledge": 0.25316455696202533, + "college_actuarial_science": 0.2169811320754717, + "college_education": 0.35514018691588783, + "college_engineering_hydrology": 0.32075471698113206, + "college_law": 0.16666666666666666, + "college_mathematics": 0.2571428571428571, + "college_medical_statistics": 0.3018867924528302, + "college_medicine": 0.2600732600732601, + "computer_science": 0.3088235294117647, + "computer_security": 0.26900584795321636, + "conceptual_physics": 0.2857142857142857, + "construction_project_management": 0.30935251798561153, + "economics": 0.27044025157232704, + "education": 0.24539877300613497, + "electrical_engineering": 0.27325581395348836, + "elementary_chinese": 0.2857142857142857, + "elementary_commonsense": 0.2474747474747475, + "elementary_information_and_technology": 0.28991596638655465, + "elementary_mathematics": 0.27391304347826084, + "ethnology": 0.24444444444444444, + "food_science": 0.23776223776223776, + "genetics": 0.24431818181818182, + "global_facts": 0.2080536912751678, + "high_school_biology": 0.2603550295857988, + "high_school_chemistry": 0.25, + "high_school_geography": 0.2627118644067797, + "high_school_mathematics": 0.20121951219512196, + "high_school_physics": 0.2727272727272727, + "high_school_politics": 0.26573426573426573, + "human_sexuality": 0.2777777777777778, + "international_law": 0.24864864864864866, + "journalism": 0.29069767441860467, + "jurisprudence": 0.25790754257907544, + "legal_and_moral_basis": 0.2897196261682243, + "logical": 0.2682926829268293, + "machine_learning": 0.27049180327868855, + "management": 0.2619047619047619, + "marketing": 0.31666666666666665, + "marxist_theory": 0.2857142857142857, + "modern_chinese": 0.23275862068965517, + "nutrition": 0.2896551724137931, + "philosophy": 0.29523809523809524, + "professional_accounting": 0.28, + "professional_law": 0.22274881516587677, + "professional_medicine": 0.24202127659574468, + "professional_psychology": 0.22844827586206898, + "public_relations": 0.25862068965517243, + "security_study": 0.26666666666666666, + "sociology": 0.252212389380531, + "sports_science": 0.26666666666666666, + "traditional_chinese_medicine": 0.23243243243243245, + "virology": 0.28402366863905326, + "world_history": 0.2857142857142857, + "world_religions": 0.2125 + } + }, + "prompt_3": { + "accuracy": 0.2623899153859437, + "category_acc": { + "agronomy": 0.25443786982248523, + "anatomy": 0.28378378378378377, + "ancient_chinese": 0.24390243902439024, + "arts": 0.24375, + "astronomy": 0.24848484848484848, + "business_ethics": 0.23923444976076555, + "chinese_civil_service_exam": 0.23125, + "chinese_driving_rule": 0.31297709923664124, + "chinese_food_culture": 0.19117647058823528, + "chinese_foreign_policy": 0.27102803738317754, + "chinese_history": 0.26006191950464397, + "chinese_literature": 0.2647058823529412, + "chinese_teacher_qualification": 0.2681564245810056, + "clinical_knowledge": 0.24472573839662448, + "college_actuarial_science": 0.22641509433962265, + "college_education": 0.3177570093457944, + "college_engineering_hydrology": 0.27358490566037735, + "college_law": 0.2037037037037037, + "college_mathematics": 0.26666666666666666, + "college_medical_statistics": 0.29245283018867924, + "college_medicine": 0.26373626373626374, + "computer_science": 0.30392156862745096, + "computer_security": 0.26900584795321636, + "conceptual_physics": 0.2585034013605442, + "construction_project_management": 0.2733812949640288, + "economics": 0.27672955974842767, + "education": 0.25153374233128833, + "electrical_engineering": 0.3081395348837209, + "elementary_chinese": 0.3055555555555556, + "elementary_commonsense": 0.2222222222222222, + "elementary_information_and_technology": 0.2857142857142857, + "elementary_mathematics": 0.25217391304347825, + "ethnology": 0.2740740740740741, + "food_science": 0.2867132867132867, + "genetics": 0.23863636363636365, + "global_facts": 0.2483221476510067, + "high_school_biology": 0.24260355029585798, + "high_school_chemistry": 0.22727272727272727, + "high_school_geography": 0.2542372881355932, + "high_school_mathematics": 0.18902439024390244, + "high_school_physics": 0.2818181818181818, + "high_school_politics": 0.21678321678321677, + "human_sexuality": 0.29365079365079366, + "international_law": 0.2864864864864865, + "journalism": 0.3023255813953488, + "jurisprudence": 0.2773722627737226, + "legal_and_moral_basis": 0.2803738317757009, + "logical": 0.2682926829268293, + "machine_learning": 0.2786885245901639, + "management": 0.2619047619047619, + "marketing": 0.2722222222222222, + "marxist_theory": 0.2698412698412698, + "modern_chinese": 0.2672413793103448, + "nutrition": 0.2896551724137931, + "philosophy": 0.3238095238095238, + "professional_accounting": 0.25142857142857145, + "professional_law": 0.22274881516587677, + "professional_medicine": 0.26595744680851063, + "professional_psychology": 0.24568965517241378, + "public_relations": 0.2413793103448276, + "security_study": 0.26666666666666666, + "sociology": 0.2610619469026549, + "sports_science": 0.28484848484848485, + "traditional_chinese_medicine": 0.21621621621621623, + "virology": 0.28402366863905326, + "world_history": 0.2732919254658385, + "world_religions": 0.2375 + } }, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 - }, - "cn_eval": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 - }, - "us_eval": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 - }, - "ph_eval": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 - }, - "sing2eng": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 - }, - "flores_ind2eng": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 - }, - "flores_vie2eng": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 - }, - "flores_zho2eng": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 - }, - "flores_zsm2eng": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 - }, - "mmlu": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 - }, - "mmlu_full": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 - }, - "c_eval": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 - }, - "c_eval_full": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 - }, - "cmmlu": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 - }, - "cmmlu_full": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 - }, - "zbench": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_4": { + "accuracy": 0.26221723363840443, + "category_acc": { + "agronomy": 0.3136094674556213, + "anatomy": 0.2702702702702703, + "ancient_chinese": 0.2621951219512195, + "arts": 0.2625, + "astronomy": 0.24848484848484848, + "business_ethics": 0.2583732057416268, + "chinese_civil_service_exam": 0.25, + "chinese_driving_rule": 0.31297709923664124, + "chinese_food_culture": 0.2426470588235294, + "chinese_foreign_policy": 0.2336448598130841, + "chinese_history": 0.23219814241486067, + "chinese_literature": 0.24019607843137256, + "chinese_teacher_qualification": 0.26256983240223464, + "clinical_knowledge": 0.26582278481012656, + "college_actuarial_science": 0.24528301886792453, + "college_education": 0.308411214953271, + "college_engineering_hydrology": 0.2641509433962264, + "college_law": 0.23148148148148148, + "college_mathematics": 0.24761904761904763, + "college_medical_statistics": 0.3018867924528302, + "college_medicine": 0.304029304029304, + "computer_science": 0.25, + "computer_security": 0.23976608187134502, + "conceptual_physics": 0.272108843537415, + "construction_project_management": 0.302158273381295, + "economics": 0.27672955974842767, + "education": 0.25766871165644173, + "electrical_engineering": 0.27906976744186046, + "elementary_chinese": 0.2896825396825397, + "elementary_commonsense": 0.23232323232323232, + "elementary_information_and_technology": 0.27310924369747897, + "elementary_mathematics": 0.27391304347826084, + "ethnology": 0.26666666666666666, + "food_science": 0.27972027972027974, + "genetics": 0.25, + "global_facts": 0.21476510067114093, + "high_school_biology": 0.25443786982248523, + "high_school_chemistry": 0.22727272727272727, + "high_school_geography": 0.2457627118644068, + "high_school_mathematics": 0.1951219512195122, + "high_school_physics": 0.2545454545454545, + "high_school_politics": 0.25874125874125875, + "human_sexuality": 0.25396825396825395, + "international_law": 0.2810810810810811, + "journalism": 0.25, + "jurisprudence": 0.2846715328467153, + "legal_and_moral_basis": 0.2897196261682243, + "logical": 0.3089430894308943, + "machine_learning": 0.22950819672131148, + "management": 0.24761904761904763, + "marketing": 0.3055555555555556, + "marxist_theory": 0.2962962962962963, + "modern_chinese": 0.23275862068965517, + "nutrition": 0.2827586206896552, + "philosophy": 0.3238095238095238, + "professional_accounting": 0.2742857142857143, + "professional_law": 0.24170616113744076, + "professional_medicine": 0.23404255319148937, + "professional_psychology": 0.24568965517241378, + "public_relations": 0.25862068965517243, + "security_study": 0.2518518518518518, + "sociology": 0.28761061946902655, + "sports_science": 0.2545454545454545, + "traditional_chinese_medicine": 0.23243243243243245, + "virology": 0.28402366863905326, + "world_history": 0.2236024844720497, + "world_religions": 0.2375 + } + }, + "prompt_5": { + "accuracy": 0.2657572094629598, + "category_acc": { + "agronomy": 0.24260355029585798, + "anatomy": 0.2972972972972973, + "ancient_chinese": 0.2865853658536585, + "arts": 0.25625, + "astronomy": 0.2787878787878788, + "business_ethics": 0.23923444976076555, + "chinese_civil_service_exam": 0.21875, + "chinese_driving_rule": 0.29770992366412213, + "chinese_food_culture": 0.22794117647058823, + "chinese_foreign_policy": 0.2616822429906542, + "chinese_history": 0.23219814241486067, + "chinese_literature": 0.25980392156862747, + "chinese_teacher_qualification": 0.25139664804469275, + "clinical_knowledge": 0.2616033755274262, + "college_actuarial_science": 0.25471698113207547, + "college_education": 0.3177570093457944, + "college_engineering_hydrology": 0.2641509433962264, + "college_law": 0.2037037037037037, + "college_mathematics": 0.21904761904761905, + "college_medical_statistics": 0.3113207547169811, + "college_medicine": 0.28205128205128205, + "computer_science": 0.30392156862745096, + "computer_security": 0.24561403508771928, + "conceptual_physics": 0.272108843537415, + "construction_project_management": 0.33093525179856115, + "economics": 0.3081761006289308, + "education": 0.26380368098159507, + "electrical_engineering": 0.2616279069767442, + "elementary_chinese": 0.3055555555555556, + "elementary_commonsense": 0.2676767676767677, + "elementary_information_and_technology": 0.2689075630252101, + "elementary_mathematics": 0.25217391304347825, + "ethnology": 0.25925925925925924, + "food_science": 0.3146853146853147, + "genetics": 0.2215909090909091, + "global_facts": 0.21476510067114093, + "high_school_biology": 0.27218934911242604, + "high_school_chemistry": 0.24242424242424243, + "high_school_geography": 0.2457627118644068, + "high_school_mathematics": 0.2073170731707317, + "high_school_physics": 0.2636363636363636, + "high_school_politics": 0.2937062937062937, + "human_sexuality": 0.2222222222222222, + "international_law": 0.3027027027027027, + "journalism": 0.3023255813953488, + "jurisprudence": 0.2846715328467153, + "legal_and_moral_basis": 0.308411214953271, + "logical": 0.2682926829268293, + "machine_learning": 0.21311475409836064, + "management": 0.24761904761904763, + "marketing": 0.29444444444444445, + "marxist_theory": 0.2698412698412698, + "modern_chinese": 0.22413793103448276, + "nutrition": 0.32413793103448274, + "philosophy": 0.3047619047619048, + "professional_accounting": 0.2914285714285714, + "professional_law": 0.26066350710900477, + "professional_medicine": 0.26063829787234044, + "professional_psychology": 0.28448275862068967, + "public_relations": 0.2413793103448276, + "security_study": 0.23703703703703705, + "sociology": 0.24336283185840707, + "sports_science": 0.2545454545454545, + "traditional_chinese_medicine": 0.24864864864864866, + "virology": 0.2603550295857988, + "world_history": 0.2795031055900621, + "world_religions": 0.25 + } + } + }, + "zbench": { + "prompt_1": { + "accuracy": 0.15151515151515152 + }, + "prompt_2": { + "accuracy": 0.09090909090909091 + }, + "prompt_3": { + "accuracy": 0.09090909090909091 + }, + "prompt_4": { + "accuracy": 0.12121212121212122 + }, + "prompt_5": { + "accuracy": 0.12121212121212122 + } }, "ind_emotion": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": { + "accuracy": 0.125 + }, + "prompt_2": { + "accuracy": 0.12272727272727273 + }, + "prompt_3": { + "accuracy": 0.1409090909090909 + }, + "prompt_4": { + "accuracy": 0.2818181818181818 + }, + "prompt_5": { + "accuracy": 0.29772727272727273 + } }, "ocnli": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": { + "accuracy": 0.3376271186440678 + }, + "prompt_2": { + "accuracy": 0.33220338983050846 + }, + "prompt_3": { + "accuracy": 0.33661016949152545 + }, + "prompt_4": { + "accuracy": 0.3494915254237288 + }, + "prompt_5": { + "accuracy": 0.3403389830508475 + } }, "c3": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": { + "accuracy": 0.34741959611069556 + }, + "prompt_2": { + "accuracy": 0.3257292445774121 + }, + "prompt_3": { + "accuracy": 0.3182498130142109 + }, + "prompt_4": { + "accuracy": 0.35789080029917725 + }, + "prompt_5": { + "accuracy": 0.3365744203440538 + } }, "dream": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": { + "accuracy": 0.5428711415972562 + }, + "prompt_2": { + "accuracy": 0.5433610975012249 + }, + "prompt_3": { + "accuracy": 0.5276825085742283 + }, + "prompt_4": { + "accuracy": 0.5360117589416953 + }, + "prompt_5": { + "accuracy": 0.540421362077413 + } }, "samsum": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": { + "rouge1": 0.1834738916315244, + "rouge2": 0.05665372281442321, + "rougeL": 0.14171060377368988, + "avg_rouge": 0.1272794060732125 + }, + "prompt_2": { + "rouge1": 0.2032867878554549, + "rouge2": 0.06188850672720403, + "rougeL": 0.1558551917670449, + "avg_rouge": 0.1403434954499013 + }, + "prompt_3": { + "rouge1": 0.17331038811668037, + "rouge2": 0.05045380981171919, + "rougeL": 0.1330873797403758, + "avg_rouge": 0.11895052588959178 + }, + "prompt_4": { + "rouge1": 0.18607934494101233, + "rouge2": 0.056130347817802406, + "rougeL": 0.14212340633137646, + "avg_rouge": 0.12811103303006374 + }, + "prompt_5": { + "rouge1": 0.15816617371583883, + "rouge2": 0.04816674239806521, + "rougeL": 0.12182373964697867, + "avg_rouge": 0.10938555192029424 + } }, "dialogsum": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": { + "rouge1": 0.1617876971461384, + "rouge2": 0.03844699667915495, + "rougeL": 0.12295372467530369, + "avg_rouge": 0.10772947283353235 + }, + "prompt_2": { + "rouge1": 0.09796881476593486, + "rouge2": 0.022814276789812054, + "rougeL": 0.07132365775852015, + "avg_rouge": 0.0640355831047557 + }, + "prompt_3": { + "rouge1": 0.10265966264632512, + "rouge2": 0.025227346610254563, + "rougeL": 0.07437479368269946, + "avg_rouge": 0.06742060097975971 + }, + "prompt_4": { + "rouge1": 0.16375650195715208, + "rouge2": 0.04081621800175042, + "rougeL": 0.12385082995690865, + "avg_rouge": 0.1094745166386037 + }, + "prompt_5": { + "rouge1": 0.14604445803222393, + "rouge2": 0.03870558262534011, + "rougeL": 0.1124706671846648, + "avg_rouge": 0.09907356928074294 + } }, "sst2": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": { + "accuracy": 0.856651376146789 + }, + "prompt_2": { + "accuracy": 0.8715596330275229 + }, + "prompt_3": { + "accuracy": 0.8532110091743119 + }, + "prompt_4": { + "accuracy": 0.8669724770642202 + }, + "prompt_5": { + "accuracy": 0.8474770642201835 + } }, "cola": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": { + "accuracy": 0.5206136145733461 + }, + "prompt_2": { + "accuracy": 0.5417066155321189 + }, + "prompt_3": { + "accuracy": 0.5714285714285714 + }, + "prompt_4": { + "accuracy": 0.5848513902205177 + }, + "prompt_5": { + "accuracy": 0.5263662511984659 + } }, "qqp": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": { + "accuracy": 0.5815 + }, + "prompt_2": { + "accuracy": 0.5745 + }, + "prompt_3": { + "accuracy": 0.5585 + }, + "prompt_4": { + "accuracy": 0.5395 + }, + "prompt_5": { + "accuracy": 0.549 + } }, "mnli": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": { + "accuracy": 0.4275 + }, + "prompt_2": { + "accuracy": 0.421 + }, + "prompt_3": { + "accuracy": 0.382 + }, + "prompt_4": { + "accuracy": 0.4205 + }, + "prompt_5": { + "accuracy": 0.418 + } }, "qnli": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": { + "accuracy": 0.532 + }, + "prompt_2": { + "accuracy": 0.5165 + }, + "prompt_3": { + "accuracy": 0.5175 + }, + "prompt_4": { + "accuracy": 0.521 + }, + "prompt_5": { + "accuracy": 0.518 + } }, "wnli": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": { + "accuracy": 0.49295774647887325 + }, + "prompt_2": { + "accuracy": 0.43661971830985913 + }, + "prompt_3": { + "accuracy": 0.4225352112676056 + }, + "prompt_4": { + "accuracy": 0.4225352112676056 + }, + "prompt_5": { + "accuracy": 0.43661971830985913 + } }, "rte": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": { + "accuracy": 0.5379061371841155 + }, + "prompt_2": { + "accuracy": 0.5451263537906137 + }, + "prompt_3": { + "accuracy": 0.5631768953068592 + }, + "prompt_4": { + "accuracy": 0.5342960288808665 + }, + "prompt_5": { + "accuracy": 0.555956678700361 + } }, "mrpc": { - "prompt_1": -1, - "prompt_2": -1, - "prompt_3": -1, - "prompt_4": -1, - "prompt_5": -1 + "prompt_1": { + "accuracy": 0.5857843137254902 + }, + "prompt_2": { + "accuracy": 0.5661764705882353 + }, + "prompt_3": { + "accuracy": 0.6225490196078431 + }, + "prompt_4": { + "accuracy": 0.553921568627451 + }, + "prompt_5": { + "accuracy": 0.5049019607843137 + } } }, "five_shot": {