{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import os, json"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" zero_native | \n",
" zero_self_con | \n",
" zero_cot | \n",
" zero_cot_self_con | \n",
" few_native | \n",
" few_self_con | \n",
" few_cot | \n",
" few_cot_self_con | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Baichuan-13B-Chat | \n",
" 18.3 | \n",
" 20.4 | \n",
" 28.6 | \n",
" 37 | \n",
" 24.1 | \n",
" 26.7 | \n",
" 18.200000 | \n",
" 17.800000 | \n",
"
\n",
" \n",
" 1 | \n",
" Chinese-Alpaca-2-13B | \n",
" 37.7 | \n",
" 37.7 | \n",
" 49.7 | \n",
" 49.7 | \n",
" 48.6 | \n",
" 48.6 | \n",
" 50.500000 | \n",
" 50.500000 | \n",
"
\n",
" \n",
" 2 | \n",
" GPT-3.5-turbo | \n",
" 66.6 | \n",
" 66.8 | \n",
" 69.6 | \n",
" 72 | \n",
" 68.3 | \n",
" 68.3 | \n",
" 70.900000 | \n",
" 72.500000 | \n",
"
\n",
" \n",
" 3 | \n",
" LLaMA-2-13B | \n",
" 41.8 | \n",
" 46.5 | \n",
" 53.1 | \n",
" 58.7 | \n",
" 53.3 | \n",
" 53 | \n",
" 56.800000 | \n",
" 61.000000 | \n",
"
\n",
" \n",
" 4 | \n",
" Qwen-7B-Chat | \n",
" 45.9 | \n",
" 46 | \n",
" 47.3 | \n",
" 50.1 | \n",
" 52.1 | \n",
" 51 | \n",
" 48.300000 | \n",
" 49.800000 | \n",
"
\n",
" \n",
" 5 | \n",
" ChatGLM2-6B | \n",
" 24.8 | \n",
" 24.7 | \n",
" 36.6 | \n",
" 36.5 | \n",
" 37.6 | \n",
" 37.6 | \n",
" 40.500000 | \n",
" 40.500000 | \n",
"
\n",
" \n",
" 6 | \n",
" Chinese-LLaMA-2-13B | \n",
" 29.4 | \n",
" 29.4 | \n",
" 37.8 | \n",
" 37.8 | \n",
" 40.4 | \n",
" 40.4 | \n",
" 28.800000 | \n",
" 28.800000 | \n",
"
\n",
" \n",
" 7 | \n",
" InternLM-7B | \n",
" 38.7 | \n",
" 38.7 | \n",
" 43.9 | \n",
" 43.9 | \n",
" 45.2 | \n",
" 45.2 | \n",
" 51.400000 | \n",
" 51.400000 | \n",
"
\n",
" \n",
" 8 | \n",
" LLaMA-2-7B | \n",
" 39.5 | \n",
" 40 | \n",
" 45.4 | \n",
" 49.5 | \n",
" 48.2 | \n",
" 46.8 | \n",
" 52.000000 | \n",
" 55.200000 | \n",
"
\n",
" \n",
" 9 | \n",
" Baichuan2-13B-Chat | \n",
" 14.1 | \n",
" 15.3 | \n",
" 24.1 | \n",
" 25.8 | \n",
" 32.3 | \n",
" 33.1 | \n",
" 25.600000 | \n",
" 27.700000 | \n",
"
\n",
" \n",
" 10 | \n",
" GPT-4 | \n",
" / | \n",
" / | \n",
" / | \n",
" / | \n",
" / | \n",
" / | \n",
" 88.700000 | \n",
" 88.700000 | \n",
"
\n",
" \n",
" 11 | \n",
" AquilaChat2-34B | \n",
" 36.63 | \n",
" 36.63 | \n",
" 44.83 | \n",
" 44.83 | \n",
" 46.65 | \n",
" 46.65 | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 12 | \n",
" Yi-34B-Chat | \n",
" 57.75 | \n",
" 59.14 | \n",
" 65.11 | \n",
" 68.79 | \n",
" 68.16 | \n",
" 68.37 | \n",
" 78.090000 | \n",
" 80.060000 | \n",
"
\n",
" \n",
" 13 | \n",
" DevOps-Model-14B-Chat | \n",
" 30.69 | \n",
" 30.59 | \n",
" 55.77 | \n",
" 63.63 | \n",
" 63.85 | \n",
" 61.96 | \n",
" 41.150000 | \n",
" 44.010000 | \n",
"
\n",
" \n",
" 14 | \n",
" Qwen-72B-Chat | \n",
" 70.41 | \n",
" 70.50 | \n",
" 72.38 | \n",
" 72.56 | \n",
" 70.32 | \n",
" 70.32 | \n",
" 70.130000 | \n",
" 70.220000 | \n",
"
\n",
" \n",
" 15 | \n",
" Mistral-7B | \n",
" 29.27 | \n",
" 29.27 | \n",
" 46.30 | \n",
" 46.30 | \n",
" 47.22 | \n",
" 47.22 | \n",
" 45.580000 | \n",
" 45.580000 | \n",
"
\n",
" \n",
" 16 | \n",
" Qwen-14B-Chat | \n",
" 43.78 | \n",
" 47.81 | \n",
" 56.58 | \n",
" 59.40 | \n",
" 62.09 | \n",
" 59.70 | \n",
" 49.060000 | \n",
" 55.880000 | \n",
"
\n",
" \n",
" 17 | \n",
" LLaMA-2-70B-Chat | \n",
" 25.29 | \n",
" 25.29 | \n",
" 57.97 | \n",
" 58.06 | \n",
" 52.97 | \n",
" 52.97 | \n",
" 58.550000 | \n",
" 58.550000 | \n",
"
\n",
" \n",
" 18 | \n",
" ERNIE-Bot-4.0 | \n",
" 61.15 | \n",
" 61.15 | \n",
" 70.00 | \n",
" 70.00 | \n",
" 60.00 | \n",
" 60.00 | \n",
" 70.000000 | \n",
" 70.000000 | \n",
"
\n",
" \n",
" 19 | \n",
" ChatGLM3-6B | \n",
" 43.38487973 | \n",
" 43.38487973 | \n",
" 44.58762887 | \n",
" 44.58762887 | \n",
" 42.09621993 | \n",
" 42.09621993 | \n",
" 43.470790 | \n",
" 43.470790 | \n",
"
\n",
" \n",
" 20 | \n",
" InternLM2-Chat-20B | \n",
" 56.35738832 | \n",
" 56.35738832 | \n",
" 26.18025751 | \n",
" 26.18025751 | \n",
" 60.48109966 | \n",
" 60.48109966 | \n",
" 45.103093 | \n",
" 45.103093 | \n",
"
\n",
" \n",
" 21 | \n",
" InternLM2-Chat-7B | \n",
" 49.74226804 | \n",
" 49.74226804 | \n",
" 56.18556701 | \n",
" 56.18556701 | \n",
" 48.19587629 | \n",
" 48.19587629 | \n",
" 49.742268 | \n",
" 49.742268 | \n",
"
\n",
" \n",
" 22 | \n",
" gemma_2b | \n",
" 26.46048 | \n",
" 26.46048 | \n",
" 33.41924 | \n",
" 33.41924 | \n",
" 26.6323 | \n",
" 26.6323 | \n",
" 37.542960 | \n",
" 37.542960 | \n",
"
\n",
" \n",
" 23 | \n",
" gemma_7b | \n",
" 25.08591 | \n",
" 25.08591 | \n",
" 50.85911 | \n",
" 50.85911 | \n",
" 30.24055 | \n",
" 30.24055 | \n",
" 51.557470 | \n",
" 51.557470 | \n",
"
\n",
" \n",
" 24 | \n",
" qwen1.5-14b-base | \n",
" 34.87973 | \n",
" 34.87973 | \n",
" 60.82474 | \n",
" 60.82474 | \n",
" 65.54983 | \n",
" 65.54983 | \n",
" 47.079040 | \n",
" 47.079040 | \n",
"
\n",
" \n",
" 25 | \n",
" qwen1.5-14b-chat | \n",
" 54.89691 | \n",
" 56.4433 | \n",
" 64.08935 | \n",
" 67.09622 | \n",
" 52.23368 | \n",
" 53.52234 | \n",
" 59.536080 | \n",
" 64.175260 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name zero_native zero_self_con zero_cot \\\n",
"0 Baichuan-13B-Chat 18.3 20.4 28.6 \n",
"1 Chinese-Alpaca-2-13B 37.7 37.7 49.7 \n",
"2 GPT-3.5-turbo 66.6 66.8 69.6 \n",
"3 LLaMA-2-13B 41.8 46.5 53.1 \n",
"4 Qwen-7B-Chat 45.9 46 47.3 \n",
"5 ChatGLM2-6B 24.8 24.7 36.6 \n",
"6 Chinese-LLaMA-2-13B 29.4 29.4 37.8 \n",
"7 InternLM-7B 38.7 38.7 43.9 \n",
"8 LLaMA-2-7B 39.5 40 45.4 \n",
"9 Baichuan2-13B-Chat 14.1 15.3 24.1 \n",
"10 GPT-4 / / / \n",
"11 AquilaChat2-34B 36.63 36.63 44.83 \n",
"12 Yi-34B-Chat 57.75 59.14 65.11 \n",
"13 DevOps-Model-14B-Chat 30.69 30.59 55.77 \n",
"14 Qwen-72B-Chat 70.41 70.50 72.38 \n",
"15 Mistral-7B 29.27 29.27 46.30 \n",
"16 Qwen-14B-Chat 43.78 47.81 56.58 \n",
"17 LLaMA-2-70B-Chat 25.29 25.29 57.97 \n",
"18 ERNIE-Bot-4.0 61.15 61.15 70.00 \n",
"19 ChatGLM3-6B 43.38487973 43.38487973 44.58762887 \n",
"20 InternLM2-Chat-20B 56.35738832 56.35738832 26.18025751 \n",
"21 InternLM2-Chat-7B 49.74226804 49.74226804 56.18556701 \n",
"22 gemma_2b 26.46048 26.46048 33.41924 \n",
"23 gemma_7b 25.08591 25.08591 50.85911 \n",
"24 qwen1.5-14b-base 34.87973 34.87973 60.82474 \n",
"25 qwen1.5-14b-chat 54.89691 56.4433 64.08935 \n",
"\n",
" zero_cot_self_con few_native few_self_con few_cot few_cot_self_con \n",
"0 37 24.1 26.7 18.200000 17.800000 \n",
"1 49.7 48.6 48.6 50.500000 50.500000 \n",
"2 72 68.3 68.3 70.900000 72.500000 \n",
"3 58.7 53.3 53 56.800000 61.000000 \n",
"4 50.1 52.1 51 48.300000 49.800000 \n",
"5 36.5 37.6 37.6 40.500000 40.500000 \n",
"6 37.8 40.4 40.4 28.800000 28.800000 \n",
"7 43.9 45.2 45.2 51.400000 51.400000 \n",
"8 49.5 48.2 46.8 52.000000 55.200000 \n",
"9 25.8 32.3 33.1 25.600000 27.700000 \n",
"10 / / / 88.700000 88.700000 \n",
"11 44.83 46.65 46.65 NaN NaN \n",
"12 68.79 68.16 68.37 78.090000 80.060000 \n",
"13 63.63 63.85 61.96 41.150000 44.010000 \n",
"14 72.56 70.32 70.32 70.130000 70.220000 \n",
"15 46.30 47.22 47.22 45.580000 45.580000 \n",
"16 59.40 62.09 59.70 49.060000 55.880000 \n",
"17 58.06 52.97 52.97 58.550000 58.550000 \n",
"18 70.00 60.00 60.00 70.000000 70.000000 \n",
"19 44.58762887 42.09621993 42.09621993 43.470790 43.470790 \n",
"20 26.18025751 60.48109966 60.48109966 45.103093 45.103093 \n",
"21 56.18556701 48.19587629 48.19587629 49.742268 49.742268 \n",
"22 33.41924 26.6323 26.6323 37.542960 37.542960 \n",
"23 50.85911 30.24055 30.24055 51.557470 51.557470 \n",
"24 60.82474 65.54983 65.54983 47.079040 47.079040 \n",
"25 67.09622 52.23368 53.52234 59.536080 64.175260 "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv(\"./data/network_en_mc.csv\")\n",
"df"
]
},
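{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A hedged side-note (a sketch, not part of the original pipeline): the CSV marks\n",
"# missing scores with \"/\" (see the GPT-4 row), so several score columns load as\n",
"# strings above. Passing na_values=[\"/\"] to read_csv treats those entries as NaN\n",
"# at read time, keeping the columns numeric from the start.\n",
"df_numeric = pd.read_csv(\"./data/network_en_mc.csv\", na_values=[\"/\"])\n",
"df_numeric.dtypes"
]
},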
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Naive | \n",
" SC | \n",
" CoT | \n",
" CoT+SC | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 1 | \n",
" 57.75 | \n",
" 59.14 | \n",
" 65.11 | \n",
" 68.79 | \n",
"
\n",
" \n",
" 2 | \n",
" 70.41 | \n",
" 70.50 | \n",
" 72.38 | \n",
" 72.56 | \n",
"
\n",
" \n",
" 3 | \n",
" 66.60 | \n",
" 66.80 | \n",
" 69.60 | \n",
" 72.00 | \n",
"
\n",
" \n",
" 4 | \n",
" 61.15 | \n",
" 61.15 | \n",
" 70.00 | \n",
" 70.00 | \n",
"
\n",
" \n",
" 5 | \n",
" 54.90 | \n",
" 56.44 | \n",
" 64.09 | \n",
" 67.10 | \n",
"
\n",
" \n",
" 6 | \n",
" 34.88 | \n",
" 34.88 | \n",
" 60.82 | \n",
" 60.82 | \n",
"
\n",
" \n",
" 7 | \n",
" 30.69 | \n",
" 30.59 | \n",
" 55.77 | \n",
" 63.63 | \n",
"
\n",
" \n",
" 8 | \n",
" 43.78 | \n",
" 47.81 | \n",
" 56.58 | \n",
" 59.40 | \n",
"
\n",
" \n",
" 9 | \n",
" 41.80 | \n",
" 46.50 | \n",
" 53.10 | \n",
" 58.70 | \n",
"
\n",
" \n",
" 10 | \n",
" 56.36 | \n",
" 56.36 | \n",
" 26.18 | \n",
" 26.18 | \n",
"
\n",
" \n",
" 11 | \n",
" 25.29 | \n",
" 25.29 | \n",
" 57.97 | \n",
" 58.06 | \n",
"
\n",
" \n",
" 12 | \n",
" 49.74 | \n",
" 49.74 | \n",
" 56.19 | \n",
" 56.19 | \n",
"
\n",
" \n",
" 13 | \n",
" 39.50 | \n",
" 40.00 | \n",
" 45.40 | \n",
" 49.50 | \n",
"
\n",
" \n",
" 14 | \n",
" 45.90 | \n",
" 46.00 | \n",
" 47.30 | \n",
" 50.10 | \n",
"
\n",
" \n",
" 15 | \n",
" 25.09 | \n",
" 25.09 | \n",
" 50.86 | \n",
" 50.86 | \n",
"
\n",
" \n",
" 16 | \n",
" 38.70 | \n",
" 38.70 | \n",
" 43.90 | \n",
" 43.90 | \n",
"
\n",
" \n",
" 17 | \n",
" 37.70 | \n",
" 37.70 | \n",
" 49.70 | \n",
" 49.70 | \n",
"
\n",
" \n",
" 18 | \n",
" 29.27 | \n",
" 29.27 | \n",
" 46.30 | \n",
" 46.30 | \n",
"
\n",
" \n",
" 19 | \n",
" 36.63 | \n",
" 36.63 | \n",
" 44.83 | \n",
" 44.83 | \n",
"
\n",
" \n",
" 20 | \n",
" 43.38 | \n",
" 43.38 | \n",
" 44.59 | \n",
" 44.59 | \n",
"
\n",
" \n",
" 21 | \n",
" 24.80 | \n",
" 24.70 | \n",
" 36.60 | \n",
" 36.50 | \n",
"
\n",
" \n",
" 22 | \n",
" 29.40 | \n",
" 29.40 | \n",
" 37.80 | \n",
" 37.80 | \n",
"
\n",
" \n",
" 23 | \n",
" 26.46 | \n",
" 26.46 | \n",
" 33.42 | \n",
" 33.42 | \n",
"
\n",
" \n",
" 24 | \n",
" 18.30 | \n",
" 20.40 | \n",
" 28.60 | \n",
" 37.00 | \n",
"
\n",
" \n",
" 25 | \n",
" 14.10 | \n",
" 15.30 | \n",
" 24.10 | \n",
" 25.80 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Naive SC CoT CoT+SC\n",
"0 NaN NaN NaN NaN\n",
"1 57.75 59.14 65.11 68.79\n",
"2 70.41 70.50 72.38 72.56\n",
"3 66.60 66.80 69.60 72.00\n",
"4 61.15 61.15 70.00 70.00\n",
"5 54.90 56.44 64.09 67.10\n",
"6 34.88 34.88 60.82 60.82\n",
"7 30.69 30.59 55.77 63.63\n",
"8 43.78 47.81 56.58 59.40\n",
"9 41.80 46.50 53.10 58.70\n",
"10 56.36 56.36 26.18 26.18\n",
"11 25.29 25.29 57.97 58.06\n",
"12 49.74 49.74 56.19 56.19\n",
"13 39.50 40.00 45.40 49.50\n",
"14 45.90 46.00 47.30 50.10\n",
"15 25.09 25.09 50.86 50.86\n",
"16 38.70 38.70 43.90 43.90\n",
"17 37.70 37.70 49.70 49.70\n",
"18 29.27 29.27 46.30 46.30\n",
"19 36.63 36.63 44.83 44.83\n",
"20 43.38 43.38 44.59 44.59\n",
"21 24.80 24.70 36.60 36.50\n",
"22 29.40 29.40 37.80 37.80\n",
"23 26.46 26.46 33.42 33.42\n",
"24 18.30 20.40 28.60 37.00\n",
"25 14.10 15.30 24.10 25.80"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def process_mc_df(df):\n",
" # 将name列重命名为Model\n",
" df = df.rename(columns={\"name\": \"Model\"})\n",
" # 将zero_naive, zero_self_con, zero_cot, zero_cot_self_con, few_naive, few_self_con, few_cot, few_cot_self_con列重新组织成MultiIndex,一层为Zeroshot, Fewshot,一层为Naive, Self-Consistency, CoT, CoT+Self-Consistency\n",
" df = df.set_index(\"Model\")\n",
" # df = df.stack().unstack()\n",
" df.columns = pd.MultiIndex.from_tuples([(\"Zeroshot\", \"Naive\"), (\"Zeroshot\", \"SC\"), (\"Zeroshot\", \"CoT\"), (\"Zeroshot\", \"CoT+SC\"), (\"Fewshot\", \"Naive\"), (\"Fewshot\", \"SC\"), (\"Fewshot\", \"CoT\"), (\"Fewshot\", \"CoT+SC\")])\n",
" # 将除了Model列之外的列的value转换为数值型,失败的为NaN\n",
" df = df.apply(pd.to_numeric, errors=\"coerce\")\n",
" # 显示小数点后两位\n",
" df = df.round(2)\n",
" # 给每一行添加一列BestScore\n",
" df[\"BestScore\"] = df.max(axis=1)\n",
" # 根据BestScore给df排序\n",
" df = df.sort_values(by=\"BestScore\", ascending=False)\n",
" # \n",
" df = df.reset_index()\n",
" return df\n",
"\n",
"processed = process_mc_df(df)\n",
"processed.columns\n",
"processed['Zeroshot']"
]
},
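{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A minimal follow-up sketch (assumption: reset_index placed the model names at\n",
"# the (\"Model\", \"\") key of the column MultiIndex). It uses the Zeroshot block\n",
"# built by process_mc_df to see how much zero-shot CoT improves over the naive\n",
"# zero-shot prompt for each model.\n",
"zero = processed[\"Zeroshot\"].copy()\n",
"zero.index = processed[(\"Model\", \"\")]  # label rows by model name\n",
"zero[\"CoT_gain\"] = (zero[\"CoT\"] - zero[\"Naive\"]).round(2)\n",
"zero.sort_values(\"CoT_gain\", ascending=False)"
]
},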
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "opencompass",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 2
}