Yilun Jin
committed on
Commit
•
a59a0d0
1
Parent(s):
0302c93
update skill-wise leaderboard description
Browse files- .DS_Store +0 -0
- __pycache__/gen_table.cpython-38.pyc +0 -0
- __pycache__/meta_data.cpython-38.pyc +0 -0
- app.py +2 -4
- gen_table.py +2 -1
- meta_data.py +94 -19
.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
__pycache__/gen_table.cpython-38.pyc
CHANGED
Binary files a/__pycache__/gen_table.cpython-38.pyc and b/__pycache__/gen_table.cpython-38.pyc differ
|
|
__pycache__/meta_data.cpython-38.pyc
CHANGED
Binary files a/__pycache__/meta_data.cpython-38.pyc and b/__pycache__/meta_data.cpython-38.pyc differ
|
|
app.py
CHANGED
@@ -20,7 +20,7 @@ with gr.Blocks() as demo:
|
|
20 |
structs = [abc.abstractproperty() for _ in range(N_DATA)]
|
21 |
|
22 |
with gr.Tabs(elem_classes='tab-buttons') as tabs:
|
23 |
-
with gr.TabItem('🏅
|
24 |
gr.Markdown(LEADERBOARD_MD['MAIN'])
|
25 |
_, check_box = BUILD_L1_DF(results, MAIN_FIELDS)
|
26 |
table = generate_table(results, DEFAULT_BENCH)
|
@@ -131,7 +131,7 @@ with gr.Blocks() as demo:
|
|
131 |
interactive=False,
|
132 |
visible=True)
|
133 |
s.dataset = gr.Textbox(value=dataset, label=dataset, visible=False)
|
134 |
-
|
135 |
def filter_df_l2(dataset_name, fields, model_size, model_type):
|
136 |
s = structs[DATASETS.index(dataset_name)]
|
137 |
headers = ['Rank'] + s.check_box['essential'] + fields
|
@@ -158,8 +158,6 @@ with gr.Blocks() as demo:
|
|
158 |
fn=filter_df_l2,
|
159 |
inputs=[s.dataset, s.checkbox_group, s.model_size, s.model_type],
|
160 |
outputs=s.data_component)
|
161 |
-
print(s)
|
162 |
-
"""
|
163 |
|
164 |
with gr.Row():
|
165 |
with gr.Accordion('Citation', open=False):
|
|
|
20 |
structs = [abc.abstractproperty() for _ in range(N_DATA)]
|
21 |
|
22 |
with gr.Tabs(elem_classes='tab-buttons') as tabs:
|
23 |
+
with gr.TabItem('🏅 Shopping MMLU Leaderboard', elem_id='main', id=0):
|
24 |
gr.Markdown(LEADERBOARD_MD['MAIN'])
|
25 |
_, check_box = BUILD_L1_DF(results, MAIN_FIELDS)
|
26 |
table = generate_table(results, DEFAULT_BENCH)
|
|
|
131 |
interactive=False,
|
132 |
visible=True)
|
133 |
s.dataset = gr.Textbox(value=dataset, label=dataset, visible=False)
|
134 |
+
|
135 |
def filter_df_l2(dataset_name, fields, model_size, model_type):
|
136 |
s = structs[DATASETS.index(dataset_name)]
|
137 |
headers = ['Rank'] + s.check_box['essential'] + fields
|
|
|
158 |
fn=filter_df_l2,
|
159 |
inputs=[s.dataset, s.checkbox_group, s.model_size, s.model_type],
|
160 |
outputs=s.data_component)
|
|
|
|
|
161 |
|
162 |
with gr.Row():
|
163 |
with gr.Accordion('Citation', open=False):
|
gen_table.py
CHANGED
@@ -123,7 +123,8 @@ def BUILD_L2_DF(results, dataset):
|
|
123 |
print(df)
|
124 |
all_fields = overall_fields + non_overall_fields
|
125 |
# Use the first 5 non-overall fields as required fields
|
126 |
-
required_fields = overall_fields if len(overall_fields) else non_overall_fields[:5]
|
|
|
127 |
|
128 |
if dataset == 'OCRBench':
|
129 |
df = df.sort_values('Final Score')
|
|
|
123 |
print(df)
|
124 |
all_fields = overall_fields + non_overall_fields
|
125 |
# Use the first 5 non-overall fields as required fields
|
126 |
+
# required_fields = overall_fields if len(overall_fields) else non_overall_fields[:5]
|
127 |
+
required_fields = all_fields
|
128 |
|
129 |
if dataset == 'OCRBench':
|
130 |
df = df.sort_values('Final Score')
|
meta_data.py
CHANGED
@@ -57,34 +57,109 @@ LEADERBOARD_MD['MAIN'] = f"""
|
|
57 |
|
58 |
|
59 |
LEADERBOARD_MD['Shopping Concept Understanding'] = """
|
60 |
-
##
|
61 |
-
|
62 |
-
-
|
63 |
-
|
64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
"""
|
66 |
|
67 |
|
68 |
LEADERBOARD_MD['Shopping Knowledge Reasoning'] = """
|
69 |
-
##
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
"""
|
76 |
|
77 |
LEADERBOARD_MD['User Behavior Alignment'] = """
|
78 |
-
##
|
79 |
-
|
80 |
-
|
81 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
82 |
"""
|
83 |
|
84 |
LEADERBOARD_MD['Multi-lingual Abilities'] = """
|
85 |
-
##
|
86 |
-
|
87 |
-
-
|
88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
"""
|
90 |
|
|
|
57 |
|
58 |
|
59 |
LEADERBOARD_MD['Shopping Concept Understanding'] = """
|
60 |
+
## Shopping Concept Understanding Evaluation Results
|
61 |
+
|
62 |
+
Online shopping concepts such as brands and product models are domain-specific and not often seen in pre-training. Moreover, they often appear in short texts (e.g. queries, attribute-value pairs), so insufficient context is given to help understand them. Hence, failing to understand these concepts compromises the performance of LLMs on downstream tasks.
|
63 |
+
|
64 |
+
The sub-skills and tasks covered are:
|
65 |
+
- **Concept Normalization**:
|
66 |
+
- Product Category Synonym
|
67 |
+
- Attribute Value Synonym
|
68 |
+
- **Elaboration**:
|
69 |
+
- Attribute Explanation
|
70 |
+
- Product Category Explanation
|
71 |
+
- **Relational Inference**:
|
72 |
+
- Applicable Attribute to Product Category
|
73 |
+
- Applicable Product Category to Attribute
|
74 |
+
- Inapplicable Attributes
|
75 |
+
- Valid Attribute Value Given Attribute and Product Category
|
76 |
+
- Valid Attribute Given Attribute Value and Product Category
|
77 |
+
- Product Category Classification
|
78 |
+
- Product Category Generation
|
79 |
+
- **Sentiment Analysis**:
|
80 |
+
- Aspect-based Sentiment Classification
|
81 |
+
- Aspect-based Review Retrieval
|
82 |
+
- Aspect-based Review Selection
|
83 |
+
- Aspect-based Reviews Overall Sentiment Classification
|
84 |
+
- **Information Extraction**:
|
85 |
+
- Attribute Value Extraction
|
86 |
+
- Query Named Entity Recognition
|
87 |
+
- Aspect-based Review Keyphrase Selection
|
88 |
+
- Aspect-based Review Keyphrase Extraction
|
89 |
+
- **Summarization**:
|
90 |
+
- Attribute Naming from Description
|
91 |
+
- Product Category Naming from Description
|
92 |
+
- Review Aspect Retrieval
|
93 |
+
- Single Conversation Topic Selection
|
94 |
+
- Multi-Conversation Topic Retrieval
|
95 |
+
- Product Keyphrase Selection
|
96 |
+
- Product Keyphrase Retrieval
|
97 |
+
- Product Title Generation
|
98 |
"""
|
99 |
|
100 |
|
101 |
LEADERBOARD_MD['Shopping Knowledge Reasoning'] = """
|
102 |
+
## Shopping Knowledge Reasoning Evaluation Results
|
103 |
+
|
104 |
+
This skill focuses on understanding and applying various implicit knowledge to perform reasoning over products and their attributes. For example, calculations such as the total volume of a product pack require numeric reasoning, and finding compatible products requires multi-hop reasoning among various products over a product knowledge graph.
|
105 |
+
|
106 |
+
The sub-skills and tasks covered are:
|
107 |
+
- **Numeric Reasoning**:
|
108 |
+
- Unit Conversion
|
109 |
+
- Product Numeric Reasoning
|
110 |
+
- **Commonsense Reasoning**
|
111 |
+
- **Implicit Multi-Hop Reasoning**:
|
112 |
+
- Product Compatibility
|
113 |
+
- Complementary Product Categories
|
114 |
+
- Implicit Attribute Reasoning
|
115 |
+
- Related Brands Selection
|
116 |
+
- Related Brands Retrieval
|
117 |
"""
|
118 |
|
119 |
LEADERBOARD_MD['User Behavior Alignment'] = """
|
120 |
+
## User Behavior Alignment Evaluation Results
|
121 |
+
|
122 |
+
Accurately modeling user behaviors is a crucial skill in online shopping. A large variety of user behaviors exist in online shopping, including queries, clicks, add-to-carts, purchases, etc. Moreover, these behaviors are generally implicit and not expressed in text.
|
123 |
+
|
124 |
+
Consequently, LLMs trained with general texts encounter challenges in aligning with the heterogeneous and implicit user behaviors as they rarely observe such inputs during pre-training.
|
125 |
+
|
126 |
+
The sub-skills and tasks covered are:
|
127 |
+
- **Query-Query Relations**:
|
128 |
+
- Query Re-Writing
|
129 |
+
- Query-Query Intention Selection
|
130 |
+
- Intention-Based Related Query Retrieval
|
131 |
+
- **Query-Product Relations**:
|
132 |
+
- Product Category Selection for Query
|
133 |
+
- Query-Product Relation Selection
|
134 |
+
- Query-Product Ranking
|
135 |
+
- **Sessions**:
|
136 |
+
- Session-based Query Recommendation
|
137 |
+
- Session-based Next Query Selection
|
138 |
+
- Session-based Next Product Selection
|
139 |
+
- **Purchases**:
|
140 |
+
- Product Co-Purchase Selection
|
141 |
+
- Product Co-Purchase Retrieval
|
142 |
+
- **Reviews and QA**:
|
143 |
+
- Review Rating Prediction
|
144 |
+
- Aspect-Sentiment-Based Review Generation
|
145 |
+
- Review Helpfulness Selection
|
146 |
+
- Product-Based Question Answering
|
147 |
"""
|
148 |
|
149 |
LEADERBOARD_MD['Multi-lingual Abilities'] = """
|
150 |
+
## Multi-lingual Abilities Evaluation Results
|
151 |
+
|
152 |
+
Multi-lingual models are desired in online shopping as they can be deployed in multiple marketplaces without re-training.
|
153 |
+
|
154 |
+
The sub-skills and tasks covered are:
|
155 |
+
- **Multi-lingual Shopping Concept Understanding**:
|
156 |
+
- Multi-lingual Product Title Generation
|
157 |
+
- Multi-lingual Product Keyphrase Selection
|
158 |
+
- Cross-lingual Product Title Translation
|
159 |
+
- Cross-lingual Product Entity Alignment
|
160 |
+
- **Multi-lingual User Behavior Alignment**:
|
161 |
+
- Multi-lingual Query-product Relation Selection
|
162 |
+
- Multi-lingual Query-product Ranking
|
163 |
+
- Multi-lingual Session-based Product Recommendation
|
164 |
"""
|
165 |
|