shopping_mmlu_leaderboard

Running

App Files Files Community

Yilun Jin committed on Oct 31, 2024

Commit

a59a0d0

•

1 Parent(s): 0302c93

update skill-wise leaderboard description

Browse files

Files changed (6) hide show

.DS_Store +0 -0
__pycache__/gen_table.cpython-38.pyc +0 -0
__pycache__/meta_data.cpython-38.pyc +0 -0
app.py +2 -4
gen_table.py +2 -1
meta_data.py +94 -19

.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

__pycache__/gen_table.cpython-38.pyc CHANGED Viewed

Binary files a/__pycache__/gen_table.cpython-38.pyc and b/__pycache__/gen_table.cpython-38.pyc differ

__pycache__/meta_data.cpython-38.pyc CHANGED Viewed

Binary files a/__pycache__/meta_data.cpython-38.pyc and b/__pycache__/meta_data.cpython-38.pyc differ

app.py CHANGED Viewed

@@ -20,7 +20,7 @@ with gr.Blocks() as demo:
     structs = [abc.abstractproperty() for _ in range(N_DATA)]
     with gr.Tabs(elem_classes='tab-buttons') as tabs:
-        with gr.TabItem('🏅 OpenVLM Main Leaderboard', elem_id='main', id=0):
             gr.Markdown(LEADERBOARD_MD['MAIN'])
             _, check_box = BUILD_L1_DF(results, MAIN_FIELDS)
             table = generate_table(results, DEFAULT_BENCH)
@@ -131,7 +131,7 @@ with gr.Blocks() as demo:
                     interactive=False,
                     visible=True)
                 s.dataset = gr.Textbox(value=dataset, label=dataset, visible=False)
-                """
                 def filter_df_l2(dataset_name, fields, model_size, model_type):
                     s = structs[DATASETS.index(dataset_name)]
                     headers = ['Rank'] + s.check_box['essential'] + fields
@@ -158,8 +158,6 @@ with gr.Blocks() as demo:
                         fn=filter_df_l2,
                         inputs=[s.dataset, s.checkbox_group, s.model_size, s.model_type],
                         outputs=s.data_component)
-                print(s)
-                """
     with gr.Row():
         with gr.Accordion('Citation', open=False):

     structs = [abc.abstractproperty() for _ in range(N_DATA)]
     with gr.Tabs(elem_classes='tab-buttons') as tabs:
+        with gr.TabItem('🏅 Shopping MMLU Leaderboard', elem_id='main', id=0):
             gr.Markdown(LEADERBOARD_MD['MAIN'])
             _, check_box = BUILD_L1_DF(results, MAIN_FIELDS)
             table = generate_table(results, DEFAULT_BENCH)
                     interactive=False,
                     visible=True)
                 s.dataset = gr.Textbox(value=dataset, label=dataset, visible=False)
                 def filter_df_l2(dataset_name, fields, model_size, model_type):
                     s = structs[DATASETS.index(dataset_name)]
                     headers = ['Rank'] + s.check_box['essential'] + fields
                         fn=filter_df_l2,
                         inputs=[s.dataset, s.checkbox_group, s.model_size, s.model_type],
                         outputs=s.data_component)
     with gr.Row():
         with gr.Accordion('Citation', open=False):

gen_table.py CHANGED Viewed

@@ -123,7 +123,8 @@ def BUILD_L2_DF(results, dataset):
     print(df)
     all_fields = overall_fields + non_overall_fields
     # Use the first 5 non-overall fields as required fields
-    required_fields = overall_fields if len(overall_fields) else non_overall_fields[:5]
     if dataset == 'OCRBench':
         df = df.sort_values('Final Score')

     print(df)
     all_fields = overall_fields + non_overall_fields
     # Use the first 5 non-overall fields as required fields
+    # required_fields = overall_fields if len(overall_fields) else non_overall_fields[:5]
+    required_fields = all_fields
     if dataset == 'OCRBench':
         df = df.sort_values('Final Score')

meta_data.py CHANGED Viewed

@@ -57,34 +57,109 @@ LEADERBOARD_MD['MAIN'] = f"""
 LEADERBOARD_MD['Shopping Concept Understanding'] = """
-## MMVet Evaluation Results
-- In MMVet Evaluation, we use GPT-4-Turbo (gpt-4-1106-preview) as the judge LLM to assign scores to the VLM outputs. We only perform the evaluation once due to the limited variance among results of multiple evaluation pass originally reported.
-- No specific prompt template adopted for **ALL VLMs**.
-- We also provide performance on the [**Official Leaderboard**](https://paperswithcode.com/sota/visual-question-answering-on-mm-vet) for models that are applicable. Those results are obtained with GPT-4-0314 evaluator (which has been deperacted for new users).
 """
 LEADERBOARD_MD['Shopping Knowledge Reasoning'] = """
-## MMMU TestMini Evaluation Results
-- We report the evaluation results on MathVista **TestMini**, which include 1000 test samples.
-- We adopt `GPT-4-Turbo (1106)` as the answer extractor when we failed to extract the answer with heuristic matching.
-- The performance of **Human  (High school)** and **Random Choice** are copied from the official leaderboard.
-**Category Definitions:** **FQA:** figure QA, **GPS:** geometry problem solving, **MWP:** math word problem, **TQA:** textbook QA, **VQA:** visual QA, **ALG:** algebraic, **ARI:** arithmetic, **GEO:** geometry, **LOG:** logical , **NUM:** numeric, **SCI:** scientific, **STA:** statistical.
 """
 LEADERBOARD_MD['User Behavior Alignment'] = """
-## OCRBench Evaluation Results
-- The evaluation of OCRBench is implemented by the official team: https://github.com/Yuliang-Liu/MultimodalOCR.
-- The performance of GPT4V might be underestimated: GPT4V rejects to answer 12 percent of the questions due to the policy of OpenAI. For those questions, the returned answer is "Your input image may contain content that is not allowed by our safety system."
 """
 LEADERBOARD_MD['Multi-lingual Abilities'] = """
-## MMStar Evaluation Results
-- MMStar is an elite vision-indispensable multi-modal benchmark, including 1,500 challenging samples meticulously selected by humans.
-- During the evaluation of MMStar, we find that some API models may reject to answer some of the questions. Currently, we treat such cases as wrong answers when reporting the results.
 """

 LEADERBOARD_MD['Shopping Concept Understanding'] = """
+## Shopping Concept Understanding Evaluation Results
+Online shopping concepts such as brands and product models are domain-specific and not often seen in pre-training. Moreover, they often appear in short texts (e.g. queries, attribute-value pairs), so there is not enough context to help understand them. Hence, failing to understand these concepts compromises the performance of LLMs on downstream tasks.
+This skill covers the following sub-skills and tasks:
+- **Concept Normalization**:
+  - Product Category Synonym
+  - Attribute Value Synonym
+- **Elaboration**:
+  - Attribute Explanation
+  - Product Category Explanation
+- **Relational Inference**:
+  - Applicable Attribute to Product Category
+  - Applicable Product Category to Attribute
+  - Inapplicable Attributes
+  - Valid Attribute Value Given Attribute and Product Category
+  - Valid Attribute Given Attribute Value and Product Category
+  - Product Category Classification
+  - Product Category Generation
+- **Sentiment Analysis**:
+  - Aspect-based Sentiment Classification
+  - Aspect-based Review Retrieval
+  - Aspect-based Review Selection
+  - Aspect-based Reviews Overall Sentiment Classification
+- **Information Extraction**:
+  - Attribute Value Extraction
+  - Query Named Entity Recognition
+  - Aspect-based Review Keyphrase Selection
+  - Aspect-based Review Keyphrase Extraction
+- **Summarization**:
+  - Attribute Naming from Description
+  - Product Category Naming from Description
+  - Review Aspect Retrieval
+  - Single Conversation Topic Selection
+  - Multi-Conversation Topic Retrieval
+  - Product Keyphrase Selection
+  - Product Keyphrase Retrieval
+  - Product Title Generation
 """
 LEADERBOARD_MD['Shopping Knowledge Reasoning'] = """
+## Shopping Knowledge Reasoning Evaluation Results
+This skill focuses on understanding and applying various implicit knowledge to perform reasoning over products and their attributes. For example, calculations such as the total volume of a product pack require numeric reasoning, and finding compatible products requires multi-hop reasoning among various products over a product knowledge graph.
+This skill covers the following sub-skills and tasks:
+- **Numeric Reasoning**:
+  - Unit Conversion
+  - Product Numeric Reasoning
+- **Commonsense Reasoning**
+- **Implicit Multi-Hop Reasoning**:
+  - Product Compatibility
+  - Complementary Product Categories
+  - Implicit Attribute Reasoning
+  - Related Brands Selection
+  - Related Brands Retrieval
 """
 LEADERBOARD_MD['User Behavior Alignment'] = """
+## User Behavior Alignment Evaluation Results
+Accurately modeling user behaviors is a crucial skill in online shopping. A large variety of user behaviors exist in online shopping, including queries, clicks, add-to-carts, purchases, etc. Moreover, these behaviors are generally implicit and not expressed in text.
+Consequently, LLMs trained with general texts encounter challenges in aligning with the heterogeneous and implicit user behaviors as they rarely observe such inputs during pre-training.
+This skill covers the following sub-skills and tasks:
+- **Query-Query Relations**:
+  - Query Re-Writing
+  - Query-Query Intention Selection
+  - Intention-Based Related Query Retrieval
+- **Query-Product Relations**:
+  - Product Category Selection for Query
+  - Query-Product Relation Selection
+  - Query-Product Ranking
+- **Sessions**:
+  - Session-based Query Recommendation
+  - Session-based Next Query Selection
+  - Session-based Next Product Selection
+- **Purchases**:
+  - Product Co-Purchase Selection
+  - Product Co-Purchase Retrieval
+- **Reviews and QA**:
+  - Review Rating Prediction
+  - Aspect-Sentiment-Based Review Generation
+  - Review Helpfulness Selection
+  - Product-Based Question Answering
 """
 LEADERBOARD_MD['Multi-lingual Abilities'] = """
+## Multi-lingual Abilities Evaluation Results
+Multi-lingual models are desired in online shopping as they can be deployed in multiple marketplaces without re-training.
+This skill covers the following sub-skills and tasks:
+- **Multi-lingual Shopping Concept Understanding**:
+  - Multi-lingual Product Title Generation
+  - Multi-lingual Product Keyphrase Selection
+  - Cross-lingual Product Title Translation
+  - Cross-lingual Product Entity Alignment
+- **Multi-lingual User Behavior Alignment**:
+  - Multi-lingual Query-product Relation Selection
+  - Multi-lingual Query-product Ranking
+  - Multi-lingual Session-based Product Recommendation
 """