shopping_mmlu_leaderboard / ShoppingMMLU_overall.json
Yilun Jin
update overall leaderboard
b0464d1
{
"time": "241031154353",
"results": {
"Claude3-Sonnet": {
"META": {
"Method": [
"Claude3-Sonnet",
"https://aws.amazon.com/bedrock/claude/"
],
"Parameters": "",
"Org": "Anthropic",
"OpenSource": "No",
"Verified": "Yes"
},
"Shopping Concept Understanding": {
"Overall": 80.75
},
"Shopping Knowledge Reasoning": {
"Overall": 71.63
},
"User Behavior Alignment": {
"Overall": 70.17
},
"Multi-lingual Abilities": {
"Overall": 67.76
}
},
"Claude2": {
"META": {
"Method": [
"Claude2",
"https://aws.amazon.com/bedrock/claude/"
],
"Parameters": "",
"Org": "Anthropic",
"OpenSource": "No",
"Verified": "Yes"
},
"Shopping Concept Understanding": {
"Overall": 75.46
},
"Shopping Knowledge Reasoning": {
"Overall": 65.5
},
"User Behavior Alignment": {
"Overall": 63.53
},
"Multi-lingual Abilities": {
"Overall": 65.24
}
},
"ChatGPT": {
"META": {
"Method": [
"ChatGPT",
"https://platform.openai.com/docs/models#gpt-3-5-turbo"
],
"Parameters": "",
"Org": "OpenAI",
"OpenSource": "No",
"Verified": "Yes"
},
"Shopping Concept Understanding": {
"Overall": 75.63
},
"Shopping Knowledge Reasoning": {
"Overall": 64.97
},
"User Behavior Alignment": {
"Overall": 59.79
},
"Multi-lingual Abilities": {
"Overall": 60.81
}
},
"LLaMA3-70B-Instruct": {
"META": {
"Method": [
"LLaMA3-70B-Instruct",
"https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct"
],
"Parameters": "70B",
"Org": "Meta",
"OpenSource": "Yes",
"Verified": "Yes"
},
"Shopping Concept Understanding": {
"Overall": 75.24
},
"Shopping Knowledge Reasoning": {
"Overall": 69.29
},
"User Behavior Alignment": {
"Overall": 67.67
},
"Multi-lingual Abilities": {
"Overall": 62.0
}
},
"QWen1.5-72B": {
"META": {
"Method": [
"QWen1.5-72B",
"https://huggingface.co/Qwen/Qwen1.5-72B"
],
"Parameters": "72B",
"Org": "Alibaba",
"OpenSource": "Yes",
"Verified": "Yes"
},
"Shopping Concept Understanding": {
"Overall": 71.67
},
"Shopping Knowledge Reasoning": {
"Overall": 68.92
},
"User Behavior Alignment": {
"Overall": 64.12
},
"Multi-lingual Abilities": {
"Overall": 64.84
}
},
"LLaMA3-70B": {
"META": {
"Method": [
"LLaMA3-70B",
"https://huggingface.co/meta-llama/Meta-Llama-3-70B"
],
"Parameters": "70B",
"Org": "Meta",
"OpenSource": "Yes",
"Verified": "Yes"
},
"Shopping Concept Understanding": {
"Overall": 69.59
},
"Shopping Knowledge Reasoning": {
"Overall": 63.56
},
"User Behavior Alignment": {
"Overall": 55.77
},
"Multi-lingual Abilities": {
"Overall": 58.95
}
},
"LLaMA2-70B-Chat": {
"META": {
"Method": [
"LLaMA2-70B-Chat",
"https://huggingface.co/meta-llama/Llama-2-70b-chat-hf"
],
"Parameters": "70B",
"Org": "Meta",
"OpenSource": "Yes",
"Verified": "Yes"
},
"Shopping Concept Understanding": {
"Overall": 61.84
},
"Shopping Knowledge Reasoning": {
"Overall": 40.73
},
"User Behavior Alignment": {
"Overall": 44.2
},
"Multi-lingual Abilities": {
"Overall": 47.04
}
},
"LLaMA2-70B": {
"META": {
"Method": [
"LLaMA2-70B",
"https://huggingface.co/meta-llama/Llama-2-70b-hf"
],
"Parameters": "70B",
"Org": "Meta",
"OpenSource": "Yes",
"Verified": "Yes"
},
"Shopping Concept Understanding": {
"Overall": 61.05
},
"Shopping Knowledge Reasoning": {
"Overall": 55.87
},
"User Behavior Alignment": {
"Overall": 43.24
},
"Multi-lingual Abilities": {
"Overall": 47.85
}
},
"Mixtral-8x7B": {
"META": {
"Method": [
"Mixtral-8x7B",
"https://huggingface.co/mistralai/Mixtral-8x7B-v0.1"
],
"Parameters": "46.7B",
"Org": "MistralAI",
"OpenSource": "Yes",
"Verified": "Yes"
},
"Shopping Concept Understanding": {
"Overall": 59.43
},
"Shopping Knowledge Reasoning": {
"Overall": 54.32
},
"User Behavior Alignment": {
"Overall": 55.31
},
"Multi-lingual Abilities": {
"Overall": 44.69
}
},
"QWen1.5-14B": {
"META": {
"Method": [
"QWen1.5-14B",
"https://huggingface.co/Qwen/Qwen1.5-14B"
],
"Parameters": "14B",
"Org": "Alibaba",
"OpenSource": "Yes",
"Verified": "Yes"
},
"Shopping Concept Understanding": {
"Overall": 67.22
},
"Shopping Knowledge Reasoning": {
"Overall": 60.92
},
"User Behavior Alignment": {
"Overall": 54.92
},
"Multi-lingual Abilities": {
"Overall": 55.21
}
},
"eCeLLM-L": {
"META": {
"Method": [
"eCeLLM-L",
"https://huggingface.co/NingLab/eCeLLM-L"
],
"Parameters": "13B",
"Org": "OSU NingLab",
"OpenSource": "Yes",
"Verified": "Yes"
},
"Shopping Concept Understanding": {
"Overall": 61.54
},
"Shopping Knowledge Reasoning": {
"Overall": 54.84
},
"User Behavior Alignment": {
"Overall": 54.55
},
"Multi-lingual Abilities": {
"Overall": 59.64
}
},
"Vicuna-13B-v1.5": {
"META": {
"Method": [
"Vicuna-13B-v1.5",
"https://huggingface.co/lmsys/vicuna-13b-v1.5"
],
"Parameters": "13B",
"Org": "LMSys",
"OpenSource": "Yes",
"Verified": "Yes"
},
"Shopping Concept Understanding": {
"Overall": 59.64
},
"Shopping Knowledge Reasoning": {
"Overall": 52.63
},
"User Behavior Alignment": {
"Overall": 49.81
},
"Multi-lingual Abilities": {
"Overall": 49.64
}
},
"LLaMA2-13B-Chat": {
"META": {
"Method": [
"LLaMA2-13B-Chat",
"https://huggingface.co/meta-llama/Llama-2-13b-chat-hf"
],
"Parameters": "13B",
"Org": "Meta",
"OpenSource": "Yes",
"Verified": "Yes"
},
"Shopping Concept Understanding": {
"Overall": 51.79
},
"Shopping Knowledge Reasoning": {
"Overall": 45.01
},
"User Behavior Alignment": {
"Overall": 39.95
},
"Multi-lingual Abilities": {
"Overall": 42.99
}
},
"LLaMA2-13B": {
"META": {
"Method": [
"LLaMA2-13B",
"https://huggingface.co/meta-llama/Llama-2-13b-hf"
],
"Parameters": "13B",
"Org": "Meta",
"OpenSource": "Yes",
"Verified": "Yes"
},
"Shopping Concept Understanding": {
"Overall": 45.86
},
"Shopping Knowledge Reasoning": {
"Overall": 39.47
},
"User Behavior Alignment": {
"Overall": 39.43
},
"Multi-lingual Abilities": {
"Overall": 44.23
}
},
"LLaMA3-8B-Instruct": {
"META": {
"Method": [
"LLaMA3-8B-Instruct",
"https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct"
],
"Parameters": "8B",
"Org": "Meta",
"OpenSource": "Yes",
"Verified": "Yes"
},
"Shopping Concept Understanding": {
"Overall": 65.26
},
"Shopping Knowledge Reasoning": {
"Overall": 56.84
},
"User Behavior Alignment": {
"Overall": 54.88
},
"Multi-lingual Abilities": {
"Overall": 55.37
}
},
"LLaMA3-8B": {
"META": {
"Method": [
"LLaMA3-8B",
"https://huggingface.co/meta-llama/Meta-Llama-3-8B"
],
"Parameters": "8B",
"Org": "Meta",
"OpenSource": "Yes",
"Verified": "Yes"
},
"Shopping Concept Understanding": {
"Overall": 58.02
},
"Shopping Knowledge Reasoning": {
"Overall": 49.74
},
"User Behavior Alignment": {
"Overall": 44.16
},
"Multi-lingual Abilities": {
"Overall": 51.03
}
},
"QWen1.5-7B": {
"META": {
"Method": [
"QWen1.5-7B",
"https://huggingface.co/Qwen/Qwen1.5-7B"
],
"Parameters": "7B",
"Org": "Alibaba",
"OpenSource": "Yes",
"Verified": "Yes"
},
"Shopping Concept Understanding": {
"Overall": 58.89
},
"Shopping Knowledge Reasoning": {
"Overall": 52.34
},
"User Behavior Alignment": {
"Overall": 49.81
},
"Multi-lingual Abilities": {
"Overall": 50.14
}
},
"eCeLLM-M": {
"META": {
"Method": [
"eCeLLM-M",
"https://huggingface.co/NingLab/eCeLLM-M"
],
"Parameters": "7B",
"Org": "OSU NingLab",
"OpenSource": "Yes",
"Verified": "Yes"
},
"Shopping Concept Understanding": {
"Overall": 63.29
},
"Shopping Knowledge Reasoning": {
"Overall": 48.94
},
"User Behavior Alignment": {
"Overall": 53.78
},
"Multi-lingual Abilities": {
"Overall": 56.08
}
},
"Zephyr-Beta": {
"META": {
"Method": [
"Zephyr-Beta",
"https://huggingface.co/HuggingFaceH4/zephyr-7b-beta"
],
"Parameters": "7B",
"Org": "HuggingFace H4",
"OpenSource": "Yes",
"Verified": "Yes"
},
"Shopping Concept Understanding": {
"Overall": 61.65
},
"Shopping Knowledge Reasoning": {
"Overall": 52.57
},
"User Behavior Alignment": {
"Overall": 44.73
},
"Multi-lingual Abilities": {
"Overall": 45.35
}
},
"Mistral-7B-Instruct": {
"META": {
"Method": [
"Mistral-7B-Instruct",
"https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2"
],
"Parameters": "7B",
"Org": "MistralAI",
"OpenSource": "Yes",
"Verified": "Yes"
},
"Shopping Concept Understanding": {
"Overall": 62.03
},
"Shopping Knowledge Reasoning": {
"Overall": 46.36
},
"User Behavior Alignment": {
"Overall": 42.21
},
"Multi-lingual Abilities": {
"Overall": 43.32
}
},
"Mistral-7B": {
"META": {
"Method": [
"Mistral-7B",
"https://huggingface.co/mistralai/Mistral-7B-v0.1"
],
"Parameters": "7B",
"Org": "MistralAI",
"OpenSource": "Yes",
"Verified": "Yes"
},
"Shopping Concept Understanding": {
"Overall": 55.82
},
"Shopping Knowledge Reasoning": {
"Overall": 46.69
},
"User Behavior Alignment": {
"Overall": 46.27
},
"Multi-lingual Abilities": {
"Overall": 41.47
}
},
"Vicuna-7B-v1.5": {
"META": {
"Method": [
"Vicuna-7B-v1.5",
"https://huggingface.co/lmsys/vicuna-7b-v1.5"
],
"Parameters": "7B",
"Org": "LMSys",
"OpenSource": "Yes",
"Verified": "Yes"
},
"Shopping Concept Understanding": {
"Overall": 53.46
},
"Shopping Knowledge Reasoning": {
"Overall": 45.06
},
"User Behavior Alignment": {
"Overall": 41.11
},
"Multi-lingual Abilities": {
"Overall": 43.82
}
},
"LLaMA2-7B-Chat": {
"META": {
"Method": [
"LLaMA2-7B-Chat",
"https://huggingface.co/meta-llama/Llama-2-7b-chat-hf"
],
"Parameters": "7B",
"Org": "Meta",
"OpenSource": "Yes",
"Verified": "Yes"
},
"Shopping Concept Understanding": {
"Overall": 51.67
},
"Shopping Knowledge Reasoning": {
"Overall": 43.48
},
"User Behavior Alignment": {
"Overall": 41.42
},
"Multi-lingual Abilities": {
"Overall": 40.43
}
},
"LLaMA2-7B": {
"META": {
"Method": [
"LLaMA2-7B",
"https://huggingface.co/meta-llama/Llama-2-7b-hf"
],
"Parameters": "7B",
"Org": "Meta",
"OpenSource": "Yes",
"Verified": "Yes"
},
"Shopping Concept Understanding": {
"Overall": 38.22
},
"Shopping Knowledge Reasoning": {
"Overall": 32.81
},
"User Behavior Alignment": {
"Overall": 32.56
},
"Multi-lingual Abilities": {
"Overall": 27.71
}
},
"QWen1.5-4B": {
"META": {
"Method": [
"QWen1.5-4B",
"https://huggingface.co/Qwen/Qwen1.5-4B"
],
"Parameters": "4B",
"Org": "Alibaba",
"OpenSource": "Yes",
"Verified": "Yes"
},
"Shopping Concept Understanding": {
"Overall": 57.21
},
"Shopping Knowledge Reasoning": {
"Overall": 52.56
},
"User Behavior Alignment": {
"Overall": 42.74
},
"Multi-lingual Abilities": {
"Overall": 49.78
}
},
"Phi-2": {
"META": {
"Method": [
"Phi-2",
"https://huggingface.co/microsoft/phi-2"
],
"Parameters": "2.8B",
"Org": "Microsoft",
"OpenSource": "Yes",
"Verified": "Yes"
},
"Shopping Concept Understanding": {
"Overall": 49.34
},
"Shopping Knowledge Reasoning": {
"Overall": 42.83
},
"User Behavior Alignment": {
"Overall": 36.38
},
"Multi-lingual Abilities": {
"Overall": 32.91
}
},
"eCeLLM-S": {
"META": {
"Method": [
"eCeLLM-S",
"https://huggingface.co/NingLab/eCeLLM-S"
],
"Parameters": "2.8B",
"Org": "OSU NingLab",
"OpenSource": "Yes",
"Verified": "Yes"
},
"Shopping Concept Understanding": {
"Overall": 49.4
},
"Shopping Knowledge Reasoning": {
"Overall": 39.06
},
"User Behavior Alignment": {
"Overall": 36.33
},
"Multi-lingual Abilities": {
"Overall": 32.79
}
}
}
}