{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import os, json" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namezero_nativezero_self_conzero_cotzero_cot_self_confew_nativefew_self_confew_cotfew_cot_self_con
0Baichuan-13B-Chat18.320.428.63724.126.718.20000017.800000
1Chinese-Alpaca-2-13B37.737.749.749.748.648.650.50000050.500000
2GPT-3.5-turbo66.666.869.67268.368.370.90000072.500000
3LLaMA-2-13B41.846.553.158.753.35356.80000061.000000
4Qwen-7B-Chat45.94647.350.152.15148.30000049.800000
5ChatGLM2-6B24.824.736.636.537.637.640.50000040.500000
6Chinese-LLaMA-2-13B29.429.437.837.840.440.428.80000028.800000
7InternLM-7B38.738.743.943.945.245.251.40000051.400000
8LLaMA-2-7B39.54045.449.548.246.852.00000055.200000
9Baichuan2-13B-Chat14.115.324.125.832.333.125.60000027.700000
10GPT-4//////88.70000088.700000
11AquilaChat2-34B36.6336.6344.8344.8346.6546.65NaNNaN
12Yi-34B-Chat57.7559.1465.1168.7968.1668.3778.09000080.060000
13DevOps-Model-14B-Chat30.6930.5955.7763.6363.8561.9641.15000044.010000
14Qwen-72B-Chat70.4170.5072.3872.5670.3270.3270.13000070.220000
15Mistral-7B29.2729.2746.3046.3047.2247.2245.58000045.580000
16Qwen-14B-Chat43.7847.8156.5859.4062.0959.7049.06000055.880000
17LLaMA-2-70B-Chat25.2925.2957.9758.0652.9752.9758.55000058.550000
18ERNIE-Bot-4.061.1561.1570.0070.0060.0060.0070.00000070.000000
19ChatGLM3-6B43.3848797343.3848797344.5876288744.5876288742.0962199342.0962199343.47079043.470790
20InternLM2-Chat-20B56.3573883256.3573883226.1802575126.1802575160.4810996660.4810996645.10309345.103093
21InternLM2-Chat-7B49.7422680449.7422680456.1855670156.1855670148.1958762948.1958762949.74226849.742268
22gemma_2b26.4604826.4604833.4192433.4192426.632326.632337.54296037.542960
23gemma_7b25.0859125.0859150.8591150.8591130.2405530.2405551.55747051.557470
24qwen1.5-14b-base34.8797334.8797360.8247460.8247465.5498365.5498347.07904047.079040
25qwen1.5-14b-chat54.8969156.443364.0893567.0962252.2336853.5223459.53608064.175260
\n", "
" ], "text/plain": [ " name zero_native zero_self_con zero_cot \\\n", "0 Baichuan-13B-Chat 18.3 20.4 28.6 \n", "1 Chinese-Alpaca-2-13B 37.7 37.7 49.7 \n", "2 GPT-3.5-turbo 66.6 66.8 69.6 \n", "3 LLaMA-2-13B 41.8 46.5 53.1 \n", "4 Qwen-7B-Chat 45.9 46 47.3 \n", "5 ChatGLM2-6B 24.8 24.7 36.6 \n", "6 Chinese-LLaMA-2-13B 29.4 29.4 37.8 \n", "7 InternLM-7B 38.7 38.7 43.9 \n", "8 LLaMA-2-7B 39.5 40 45.4 \n", "9 Baichuan2-13B-Chat 14.1 15.3 24.1 \n", "10 GPT-4 / / / \n", "11 AquilaChat2-34B 36.63 36.63 44.83 \n", "12 Yi-34B-Chat 57.75 59.14 65.11 \n", "13 DevOps-Model-14B-Chat 30.69 30.59 55.77 \n", "14 Qwen-72B-Chat 70.41 70.50 72.38 \n", "15 Mistral-7B 29.27 29.27 46.30 \n", "16 Qwen-14B-Chat 43.78 47.81 56.58 \n", "17 LLaMA-2-70B-Chat 25.29 25.29 57.97 \n", "18 ERNIE-Bot-4.0 61.15 61.15 70.00 \n", "19 ChatGLM3-6B 43.38487973 43.38487973 44.58762887 \n", "20 InternLM2-Chat-20B 56.35738832 56.35738832 26.18025751 \n", "21 InternLM2-Chat-7B 49.74226804 49.74226804 56.18556701 \n", "22 gemma_2b 26.46048 26.46048 33.41924 \n", "23 gemma_7b 25.08591 25.08591 50.85911 \n", "24 qwen1.5-14b-base 34.87973 34.87973 60.82474 \n", "25 qwen1.5-14b-chat 54.89691 56.4433 64.08935 \n", "\n", " zero_cot_self_con few_native few_self_con few_cot few_cot_self_con \n", "0 37 24.1 26.7 18.200000 17.800000 \n", "1 49.7 48.6 48.6 50.500000 50.500000 \n", "2 72 68.3 68.3 70.900000 72.500000 \n", "3 58.7 53.3 53 56.800000 61.000000 \n", "4 50.1 52.1 51 48.300000 49.800000 \n", "5 36.5 37.6 37.6 40.500000 40.500000 \n", "6 37.8 40.4 40.4 28.800000 28.800000 \n", "7 43.9 45.2 45.2 51.400000 51.400000 \n", "8 49.5 48.2 46.8 52.000000 55.200000 \n", "9 25.8 32.3 33.1 25.600000 27.700000 \n", "10 / / / 88.700000 88.700000 \n", "11 44.83 46.65 46.65 NaN NaN \n", "12 68.79 68.16 68.37 78.090000 80.060000 \n", "13 63.63 63.85 61.96 41.150000 44.010000 \n", "14 72.56 70.32 70.32 70.130000 70.220000 \n", "15 46.30 47.22 47.22 45.580000 45.580000 \n", "16 59.40 62.09 59.70 49.060000 55.880000 \n", "17 58.06 52.97 52.97 58.550000 58.550000 \n", "18 70.00 60.00 60.00 70.000000 70.000000 \n", "19 44.58762887 42.09621993 42.09621993 43.470790 43.470790 \n", "20 26.18025751 60.48109966 60.48109966 45.103093 45.103093 \n", "21 56.18556701 48.19587629 48.19587629 49.742268 49.742268 \n", "22 33.41924 26.6323 26.6323 37.542960 37.542960 \n", "23 50.85911 30.24055 30.24055 51.557470 51.557470 \n", "24 60.82474 65.54983 65.54983 47.079040 47.079040 \n", "25 67.09622 52.23368 53.52234 59.536080 64.175260 " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv(\"./data/network_en_mc.csv\")\n", "df" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NaiveSCCoTCoT+SC
0NaNNaNNaNNaN
157.7559.1465.1168.79
270.4170.5072.3872.56
366.6066.8069.6072.00
461.1561.1570.0070.00
554.9056.4464.0967.10
634.8834.8860.8260.82
730.6930.5955.7763.63
843.7847.8156.5859.40
941.8046.5053.1058.70
1056.3656.3626.1826.18
1125.2925.2957.9758.06
1249.7449.7456.1956.19
1339.5040.0045.4049.50
1445.9046.0047.3050.10
1525.0925.0950.8650.86
1638.7038.7043.9043.90
1737.7037.7049.7049.70
1829.2729.2746.3046.30
1936.6336.6344.8344.83
2043.3843.3844.5944.59
2124.8024.7036.6036.50
2229.4029.4037.8037.80
2326.4626.4633.4233.42
2418.3020.4028.6037.00
2514.1015.3024.1025.80
\n", "
" ], "text/plain": [ " Naive SC CoT CoT+SC\n", "0 NaN NaN NaN NaN\n", "1 57.75 59.14 65.11 68.79\n", "2 70.41 70.50 72.38 72.56\n", "3 66.60 66.80 69.60 72.00\n", "4 61.15 61.15 70.00 70.00\n", "5 54.90 56.44 64.09 67.10\n", "6 34.88 34.88 60.82 60.82\n", "7 30.69 30.59 55.77 63.63\n", "8 43.78 47.81 56.58 59.40\n", "9 41.80 46.50 53.10 58.70\n", "10 56.36 56.36 26.18 26.18\n", "11 25.29 25.29 57.97 58.06\n", "12 49.74 49.74 56.19 56.19\n", "13 39.50 40.00 45.40 49.50\n", "14 45.90 46.00 47.30 50.10\n", "15 25.09 25.09 50.86 50.86\n", "16 38.70 38.70 43.90 43.90\n", "17 37.70 37.70 49.70 49.70\n", "18 29.27 29.27 46.30 46.30\n", "19 36.63 36.63 44.83 44.83\n", "20 43.38 43.38 44.59 44.59\n", "21 24.80 24.70 36.60 36.50\n", "22 29.40 29.40 37.80 37.80\n", "23 26.46 26.46 33.42 33.42\n", "24 18.30 20.40 28.60 37.00\n", "25 14.10 15.30 24.10 25.80" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def process_mc_df(df):\n", " # 将name列重命名为Model\n", " df = df.rename(columns={\"name\": \"Model\"})\n", " # 将zero_naive, zero_self_con, zero_cot, zero_cot_self_con, few_naive, few_self_con, few_cot, few_cot_self_con列重新组织成MultiIndex,一层为Zeroshot, Fewshot,一层为Naive, Self-Consistency, CoT, CoT+Self-Consistency\n", " df = df.set_index(\"Model\")\n", " # df = df.stack().unstack()\n", " df.columns = pd.MultiIndex.from_tuples([(\"Zeroshot\", \"Naive\"), (\"Zeroshot\", \"SC\"), (\"Zeroshot\", \"CoT\"), (\"Zeroshot\", \"CoT+SC\"), (\"Fewshot\", \"Naive\"), (\"Fewshot\", \"SC\"), (\"Fewshot\", \"CoT\"), (\"Fewshot\", \"CoT+SC\")])\n", " # 将除了Model列之外的列的value转换为数值型,失败的为NaN\n", " df = df.apply(pd.to_numeric, errors=\"coerce\")\n", " # 显示小数点后两位\n", " df = df.round(2)\n", " # 给每一行添加一列BestScore\n", " df[\"BestScore\"] = df.max(axis=1)\n", " # 根据BestScore给df排序\n", " df = df.sort_values(by=\"BestScore\", ascending=False)\n", " # \n", " df = df.reset_index()\n", " return df\n", "\n", "processed = process_mc_df(df)\n", "processed.columns\n", "processed['Zeroshot']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "opencompass", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.14" } }, "nbformat": 4, "nbformat_minor": 2 }