Spaces:

allenai
/

ZebraLogic

Running

File size: 7,466 Bytes

from pathlib import Path
from collections import OrderedDict

DEFAULT_K = "∞"
# DEFAULT_K = "1500"

banner_url = "https://github.com/yuchenlin/ZeroEval/blob/main/docs/zebra/zebra_banner.png?raw=true" # the same repo here.
BANNER = f'<div style="display: flex; justify-content: flex-start;"><img src="{banner_url}" alt="Banner" style="width: 70vw; min-width: 300px; max-width: 1000px;border: 3px solid gray; border-color: gray black;"> </div>'

# TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🦁 AI2 WildBench Leaderboard </b> </body> </html>"
  

CITATION_TEXT = """

@misc{zebralogicbench2024,
    title={ZebraLogic: Benchmarking the Logical Reasoning Ability of Language Models},
    author={Bill Yuchen Lin, Ronan Le Bras, Yejin Choi},
    url={https://hf.co/spaces/allenai/ZebraLogicBench-Leaderboard},
    year={2024}
}

@article{dziri2024faith,
  title={Faith and fate: Limits of transformers on compositionality},
  author={Nouha Dziri and Ximing Lu and Melanie Sclar and Xiang Lorraine Li and Liwei Jian and Bill Yuchen Lin and Peter West and Chandra Bhagavatula and Ronan Le Bras and Jena D. Hwang and Soumya Sanyal and Sean Welleck and Xiang Ren and Allyson Ettinger and Za{\"i}d Harchaoui and Yejin Choi},
  journal={Advances in Neural Information Processing Systems},
  volume={36},
  year={2024}
}
 
"""

# make column_names as an ordered dict
 


column_names = OrderedDict({
    "Model": "Model",   
    "Mode": "Mode",
    "Puzzle Acc": "Puzzle Acc",
    "Cell Acc": "Cell Acc",
    "No answer": "No answer",
    "Easy Puzzle Acc": "Easy Puzzle Acc",
    "Hard Puzzle Acc": "Hard Puzzle Acc",
    # "Total Puzzles": "Total Puzzles",
    # "Reason Lens": "Reason Lens",
})



LEADERBOARD_REMARKS = """**WB Reward**: for each comparison (A vs B), a reward for A is **+/-1** if A is **much better/worse** than B, and **+/-0.5** if A is **slightly better/worse** than B; when there is a **Tie**, the reward is **0**.  
"""

# **WB Reward**: for each pairwise comparison, a reward for A is **+/-1** if A is **much better/worse** than B, and **+/-0.5** if A is **slightly better/worse** than B; 0 for a **Tie**.
# The baseline models are GPT4-Turbo, Haiku, and Llama2-70B, and Mix is the average of the three. 
# **WB Score** individually scores each model based on checklists.
# Evaluator is GPT-4-Turbo.
LEADERBOARD_REMARKS_MAIN = """ 
"""
  
RANKING_COLUMN = "Puzzle Acc"

ORDERED_COLUMN_NAMES = [
    "Model", 
    "Mode",
    "Puzzle Acc",
    "Easy Puzzle Acc",
    "Hard Puzzle Acc",
    "Cell Acc",
    "No answer", 
]

 
js_light = """
function refresh() {
    const url = new URL(window.location);

    if (url.searchParams.get('__theme') !== 'light') {
        url.searchParams.set('__theme', 'light');
        window.location.href = url.href;
    }

    // Find the fieldset with the given id
    const fieldset = document.getElementById("rank-column-radio");

    // Create a new span element with the text "Decoding Mode:"
    const rankBySpan = document.createElement("span");
    rankBySpan.textContent = "Decoding Mode: ";
    rankBySpan.style.fontWeight = "bold"; // Optional: make the text bold
     rankBySpan.style.fontSize = "19px"; // Larger font size
    rankBySpan.style.paddingRight = "18px"; // Add padding on the right

    // Wrap the span and the labels in a flex container
    const flexContainer = document.createElement("div");
    flexContainer.style.display = "flex";
    flexContainer.style.alignItems = "center";

    // Insert the rankBySpan at the beginning of the flex container
    flexContainer.appendChild(rankBySpan);

    // Move all existing labels into the flex container
    while (fieldset.firstChild) {
        flexContainer.appendChild(fieldset.firstChild);
    }

    // Append the flex container back to the fieldset
    fieldset.appendChild(flexContainer);
}
"""

js_code = """
function scroll_top() {
    console.log("Hello from Gradio!");  
    const bubbles = document.querySelectorAll('.bubble-wrap');
    bubbles.forEach((bubble, index) => {
        setTimeout(() => {
            bubble.scrollTop = 0;
        }, index * 100); // Delay of 100ms between each iteration
    });
    
} 
"""


TASK_TYPE_STR = "**Tasks**: Info seeking (**InfoSek**), Creative Writing (**CrtWrt**), Coding&Debugging (**Code**), Reasoning (**Reason**), Editing (**Edit**), **Math**, Planning (**Plan**), Brainstorming (**Brnstrm**), Role playing (**RolPly**), Advice seeking (**AdvSek**), Data Analysis (**DataAna**)"

css = """

 

code {
    font-size: large;
}
footer {visibility: hidden}
.top-left-LP{
    margin-top: 6px;
    margin-left: 5px;
}
.no_margin{
    margin-top: 0px;
    margin-left: 0px;
    margin-right: 0px;
    margin-bottom: 0px;
    padding-top: 0px;
    padding-left: 0px;
    padding-right: 0px;
    padding-bottom: 0px;
}
.markdown-text{font-size: 14pt}
.markdown-text-tiny{font-size: 10pt}
.markdown-text-small{font-size: 13pt}
.markdown-text-tiny{font-size: 12pt}
.markdown-text-tiny-red{
    font-size: 12pt;
    color: red;
    background-color: yellow;
    font-color: red;
    font-weight: bold;
}
th {
  text-align: center;
  font-size: 17px; /* Adjust the font size as needed */
}
td {
  font-size: 15px; /* Adjust the font size as needed */
  text-align: center;
}

.sample_button{
    border: 2px solid #000000;
    border-radius: 10px;
    padding: 10px;
    font-size: 17pt;
    font-weight: bold;
    margin: 5px;
    background-color: #D8BFD8;
}

.chat-common{
    height: auto;
    max-height: 400px;
    min-height: 100px; 
}
.chat-specific{
    height: auto;
    max-height: 600px;
    min-height: 200px; 
}
#od-benchmark-tab-table-button{
    font-size: 15pt;
    font-weight: bold;
} 

.btn_boderline{
    border: 1px solid #000000;
    border-radius: 5px;
    padding: 5px;
    margin: 5px;
    font-size: 15pt;
    font-weight: bold; 
}

.btn_boderline_next{
    border: 0.1px solid #000000;
    border-radius: 5px;
    padding: 5px;
    margin: 5px;
    font-size: 15pt;
    font-weight: bold; 
}

.btn_boderline_gray{
    border: 0.5px solid gray;
    border-radius: 5px;
    padding: 5px;
    margin: 5px;
    font-size: 15pt;
    font-weight: italic; 
}
.btn_boderline_selected{
    border: 2px solid purple;
    background-color: #f2f2f2;
    border-radius: 5px;
    padding: 5px;
    margin: 5px;
    font-size: 15pt;
    font-weight: bold;  
}
.accordion-label button span{
    font-size: 14pt;
    font-weight: bold;
} 

#show-task-categorized span{
    font-size: 13pt;
    font-weight: bold;
}

#show-open-source-models span{
    font-size: 13pt;
    font-weight: bold;
}

#select-models span{
    font-size: 10pt;
}

#select-tasks span{
    font-size: 10pt;
}


.markdown-text-details{
    margin: 10px;
    padding: 10px;
}


button.selected[role="tab"][aria-selected="true"] {
    font-size: 18px; /* or any other size you prefer */
    font-weight: bold;
}

#od-benchmark-tab-table-ablation-button {
    font-size: larger; /* Adjust the font size as needed */
}


.plotly-plot{
    height: auto;
    max-height: 600px;
    min-height: 600px; 
}

#length-margin-radio{
    font-size: 10pt;
    # padding: 0px;
    # margin: 1px;
}

#show-task-categorized{
    font-size: 12pt; 
    font-decoration: bold;
}

#show-open-source-models{
    font-size: 12pt; 
    font-decoration: bold;
}

.box_md{
    border: 1px solid #000000;
    border-radius: 10px;
    padding: 10px;
    font-size: 12pt;
    margin: 5px;
}
"""