<!DOCTYPE html>
<html lang="en">
<!-- `lang` tells assistive technology and translation tools that the page copy is English. -->
|
|
| <head> |
<!-- Declare the encoding first: the page contains non-ASCII characters (↕, 📄, CJK text). -->
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>FeatureSelect Leaderboard</title>

<!--
  PapaParse provides the global `Papa` that the inline script at the end of <body>
  calls (Papa.parse). It is loaded as a plain blocking script, without defer/async,
  so that `Papa` already exists when that inline script runs.
-->
<script src="https://cdnjs.cloudflare.com/ajax/libs/PapaParse/5.3.0/papaparse.min.js"></script>

<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css">
<link rel="icon" href="https://raw.githubusercontent.com/tatsu-lab/alpaca_eval/main/docs/AlpacaFarm_small.png">
<link rel="stylesheet" href="https://cdn.jsdelivr.net/css-toggle-switch/latest/toggle-switch.css">
|
|
<style>
  /* ---------- Page frame ---------- */
  body {
    margin: 0;
    padding: 50px 20px;
    font-family: Arial, sans-serif;
    color: #000;
    background-color: #fff;
  }

  .container {
    margin: auto;
    max-width: 700px;
  }

  /* ---------- Title area ---------- */
  #branding {
    margin-bottom: 20px;
    text-align: center;
  }

  #branding h1 {
    margin: 0;
    font-size: 2em;
  }

  h2 {
    margin: 0;
    font-size: 1.2em;
    color: #777;
  }

  /* ---------- Leaderboard table ---------- */
  table {
    margin: auto;
    width: 100%;
    max-width: 700px;
    table-layout: fixed;
    font-size: 1em;
  }

  table th,
  table td {
    padding: 6px;
    vertical-align: middle;
    word-wrap: break-word;
  }

  table th {
    border-bottom: 2px solid #000;
  }

  /* Rank and name are the only left-aligned columns. */
  th.rank,
  td.rank {
    width: 9%;
    padding-left: 10px;
    text-align: left;
  }

  th.name,
  td.name {
    width: 55%;
    padding-left: 30px;
    text-align: left;
  }

  /* Every other column is right-aligned. */
  th:not(.rank):not(.name),
  td:not(.rank):not(.name) {
    padding-right: 10px;
    text-align: right;
  }

  /* NOTE: the :not(...):not(...) rule above is more specific than this one,
     so its padding-right (10px) wins over the 30px declared here. */
  th.winRate,
  td.winRate {
    width: 17%;
    padding-right: 30px;
  }

  /* NOTE: `table th, table td` above is more specific than the bare `th` / `td`
     selectors, so its `padding: 6px` overrides both padding-bottom values below. */
  th {
    padding-bottom: 15px;
    text-align: right;
  }

  td {
    padding-bottom: 10px;
  }

  /* Rank and win-rate cells are rendered in a muted grey. */
  #leaderboard tr th.winRate,
  #leaderboard tr td.winRate,
  #leaderboard tr th.rank,
  #leaderboard tr td.rank {
    color: #999;
  }

  /* Zebra striping. */
  table tr:nth-child(even) {
    background-color: #e8e8e8;
  }

  table tr:nth-child(odd) {
    background-color: #f8f8f8;
  }

  /* ---------- Radio toggles ---------- */
  .switch-toggle {
    display: inline-block;
    vertical-align: middle;
  }

  .switch-toggle input + label {
    padding: 2px 7px;
    border: 1px solid transparent;
    font-size: 16px;
    background-color: lightgrey;
    cursor: pointer;
  }

  .switch-toggle input:checked + label {
    color: green;
    border-color: green;
  }

  .switch-toggle input:not(:checked) + label {
    color: black;
    box-shadow: none !important;
    user-select: none;
  }

  .toggle-line {
    display: flex;
    align-items: center;
    justify-content: center;
    margin-bottom: 20px;
    font-size: 17px;
  }

  .toggle-line .switch-toggle {
    margin: 0 10px;
  }
</style>
| |
| </head> |
|
|
| <body> |
| <div class="container"> |
<!-- Page title, subtitle, caution note and link to the project repository. -->
<div id="branding">
  <h1>FeatureSelect Leaderboard</h1>
  <br>
  <h2>An Automatic Evaluator for FeatureSelect Methods</h2>

  <!-- NOTE(review): this caution still talks about GPT-4 win rates; confirm it applies to FeatureSelect. -->
  <small id="caution" style="color: #8C1515;">
    <b>Length-controlled</b> (LC) win rates alleviate length biases of GPT-4, but it may favor models finetuned on its outputs.
  </small>
  <br>
  <a href="https://github.com/Fss2652530458/AutoFS">
    <!-- The image is the only content of the link, so its alt text names the destination. -->
    <img src="https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png" alt="AutoFS repository on GitHub" style="height: 1.5em;">
  </a>
</div>
|
|
|
|
| |
<!--
  Version and filter selectors. The inline script looks the radios up by id
  (alpaca_eval, alpaca_eval_2, community, verified) and reloads the table on click.
  Each group is exposed as a named radiogroup so the "Version:" / "Filter:" text,
  which is not a <label>, is still announced with the options.
-->
<div class="toggle-line">
  Version:
  <div class="switch-toggle switch-evaluator" style="margin-right: 4em" role="radiogroup" aria-label="Version">
    <input id="alpaca_eval" name="version" type="radio">
    <!-- The empty onclick is kept as-is; presumably a touch-device workaround for the toggle-switch stylesheet (confirm before removing). -->
    <label for="alpaca_eval" onclick="">AlpacaEval</label>
    <input id="alpaca_eval_2" name="version" type="radio" checked>
    <label for="alpaca_eval_2" onclick="">AlpacaEval 2.0</label>
  </div>

  Filter:
  <div class="switch-toggle switch-filter" role="radiogroup" aria-label="Filter">
    <input id="community" name="filter" type="radio">
    <label for="community" onclick="">Community</label>
    <input id="verified" name="filter" type="radio" checked>
    <label for="verified" onclick="">Verified</label>
  </div>
</div>
| |
<!--
  Caption naming the baseline and auto-annotator. The inline script rewrites the
  text of #alpaca_eval_info when the version toggle changes, so it is marked as a
  polite live region to have the change announced by screen readers.
-->
<div class="container" style="text-align: center; margin-bottom: 10px; margin-top: -10px;">
  <small id="alpaca_eval_info" style="color: #777;" aria-live="polite">
    Baseline: GPT-4 Preview (11/06) | Auto-annotator: GPT-4 Preview (11/06)
  </small>
</div>
|
|
|
|
| |
<!--
  Leaderboard table. Only the header row is static; the inline script deletes and
  re-appends every row after the first one whenever the data is (re)loaded.
  Header clicks call the global sortTable(columnKey); the arrow-<columnKey> spans
  are presumably the hooks for a sort-direction indicator.
-->
<table id="leaderboard">
  <thead>
    <tr>
      <th class="rank" scope="col">Rank</th>
      <th class="name" scope="col" onclick="sortTable('algorithm')">Algorithm <span id="arrow-algorithm">↕</span></th>
      <th class="lenWinRate" scope="col" onclick="sortTable('num_features')">#Features <span id="arrow-num_features">↕</span></th>
      <th class="winRate" scope="col" onclick="sortTable('mean_f1')">Mean F1 <span id="arrow-mean_f1">↕</span></th>
      <th class="winRate" scope="col" onclick="sortTable('mean_auc')">Mean AUC <span id="arrow-mean_auc">↕</span></th>
      <th class="winRate" scope="col" onclick="sortTable('time')">Time (s) <span id="arrow-time">↕</span></th>
    </tr>
  </thead>
</table>
|
|
| |
<!--
  Explanatory copy shown under the table.
  NOTE(review): the text below still describes AlpacaEval rather than FeatureSelect,
  and the last section only holds placeholder text; confirm whether it should be rewritten.
-->
<div id="documentation">
  <div style="text-align: center;">
    <a href="https://github.com/tatsu-lab/alpaca_eval" style="display: inline-block;">
      <i class="fab fa-fw fa-github" aria-hidden="true"></i> GitHub
    </a>
  </div>
  <br>
  <h2>About AlpacaEval</h2>
  <p>
    <a href="https://github.com/tatsu-lab/alpaca_eval" target="_blank" rel="noopener noreferrer">AlpacaEval</a>
    is an LLM-based automatic evaluation that is fast, cheap, and reliable.
    It is based on the
    <a href="https://crfm.stanford.edu/2023/05/22/alpaca-farm.html">AlpacaFarm</a>
    evaluation set,
    which tests the ability of models to follow general user instructions.
    These responses are then compared to reference responses (Davinci003 for AlpacaEval, GPT-4 Preview for AlpacaEval 2.0) by
    the provided GPT-4 based auto-annotators,
    which results in the win rates presented above.
    AlpacaEval displays a high agreement rate with ground truth human annotations,
    and leaderboard rankings on AlpacaEval are highly correlated with leaderboard rankings
    based on human annotators.
    Please see our
    <a href="https://github.com/tatsu-lab/alpaca_eval#analysis" target="_blank" rel="noopener noreferrer">documentation</a>
    for more details on our analysis.
  </p>
  <h2>Adding new models</h2>
  <p>
    We welcome new model contributions to the leaderboard from the community!
    To do so, please follow the steps in the
    <a href="https://github.com/tatsu-lab/alpaca_eval#contributing" target="_blank" rel="noopener noreferrer">contributions
      section</a>.
    Specifically, you'll need to run the model on the evaluation set,
    auto-annotate the outputs, and submit a PR with the model config and leaderboard results.
    We've also set up a
    <a href="https://discord.gg/GJMxJSVZZM" target="_blank" rel="noopener noreferrer">Discord</a>
    for community support and discussion.
  </p>
  <h2>Adding new evaluators or eval sets</h2>
  <p>
    We also welcome contributions for new evaluators or new eval sets!
    For making new evaluators, we release our ground-truth
    <a href="https://github.com/tatsu-lab/alpaca_eval#data-release" target="_blank" rel="noopener noreferrer">human annotations</a>
    and <a href="https://github.com/tatsu-lab/alpaca_eval#analyzing-an-evaluator" target="_blank" rel="noopener noreferrer">comparison
      metrics</a>.
    We also release a
    <a href="https://github.com/tatsu-lab/alpaca_eval#analyzing-an-eval-set" target="_blank" rel="noopener noreferrer">rough guide</a>
    to follow for making new eval sets.
    We specifically encourage contributions for harder instruction distributions and for safety testing of
    LLMs.
  </p>
  <h2>AlpacaEval limitations</h2>
  <!-- Placeholder copy written in Chinese; tagged so it is announced with the right language. -->
  <p lang="zh-Hans">
    这里是简介
  </p>
</div>
|
|
| </div> |
|
|
| <script> |
// Radio inputs of the "Version" toggle.
const alpacaEvalRadio = document.querySelector('#alpaca_eval');
const alpacaEval2Radio = document.querySelector('#alpaca_eval_2');

// Radio inputs of the "Filter" toggle.
const communityRadio = document.querySelector('#community');
const verifiedRadio = document.querySelector('#verified');

// Table element that receives the leaderboard rows.
const table = document.querySelector('#leaderboard');

// CSV location for each selectable version, keyed by the id of its radio input.
const urls = {
    alpaca_eval: 'https://raw.githubusercontent.com/tatsu-lab/alpaca_eval/main/docs/data_AlpacaEval/alpaca_eval_gpt4_leaderboard.csv',
    alpaca_eval_2: 'https://raw.githubusercontent.com/tatsu-lab/alpaca_eval/main/docs/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv',
};

// CSV currently displayed; reassigned by the version toggle handlers.
let currentUrl = urls.alpaca_eval_2;
| |
// Id of the most recent updateTable() call. A response whose id no longer matches
// belongs to a superseded request and is dropped, so quick toggling cannot append
// the rows of two downloads to the same table.
let latestTableRequest = 0;

/**
 * Formats a raw CSV cell as a percentage with one decimal place.
 * Missing, blank or non-numeric values are shown as 'N/A' instead of 'NaN%'.
 */
function formatPercent(rawValue) {
    if (rawValue === undefined || rawValue === null || String(rawValue).trim() === '') {
        return 'N/A';
    }
    const value = Number(rawValue);
    return Number.isFinite(value) ? value.toFixed(1) + '%' : 'N/A';
}

/**
 * Tells whether a row with the given `filter` value passes the filter toggle:
 * "Community" shows verified, minimal and community rows; "Verified" shows
 * verified and minimal rows only.
 */
function isRowVisible(filter) {
    const isCurated = filter === 'verified' || filter === 'minimal';
    if (communityRadio.checked) {
        return isCurated || filter === 'community';
    }
    return verifiedRadio.checked && isCurated;
}

/**
 * Builds one table row (rank, name, length-controlled win rate, win rate) from a
 * parsed CSV record. Reads the columns name, link, samples, win_rate and
 * length_controlled_winrate.
 * NOTE(review): rows carry four cells while the table header declares six columns.
 */
function buildLeaderboardRow(row, rank) {
    const tr = document.createElement('tr');
    const addCell = function (className) {
        const td = document.createElement('td');
        td.classList.add(className);
        tr.appendChild(td);
        return td;
    };

    addCell('rank').textContent = rank;

    const nameTd = addCell('name');
    if (row['link'] && row['link'].trim() !== '') {
        const a = document.createElement('a');
        a.textContent = row['name'];
        a.href = row['link'];
        a.target = '_blank';
        // The URL comes from the CSV, so do not hand the opened page a window.opener reference.
        a.rel = 'noopener noreferrer';
        nameTd.appendChild(a);
    } else {
        nameTd.textContent = row['name'];
    }

    if (row['samples'] && row['samples'].trim() !== '') {
        const samplesLink = document.createElement('a');
        samplesLink.textContent = ' 📄';
        samplesLink.href = row['samples'];
        samplesLink.target = '_blank';
        samplesLink.rel = 'noopener noreferrer';
        samplesLink.style.textDecoration = 'none';
        nameTd.appendChild(samplesLink);
    }

    addCell('lenWinRate').textContent = formatPercent(row['length_controlled_winrate']);
    addCell('winRate').textContent = formatPercent(row['win_rate']);

    return tr;
}

/**
 * Clears every row below the header, downloads the CSV at `url` with PapaParse
 * and appends one row per record that passes the currently selected filter.
 * Ranks are assigned in CSV order, counting only the rows that are displayed.
 *
 * @param {string} url Location of the leaderboard CSV (parsed with a header row).
 */
function updateTable(url) {
    const requestId = ++latestTableRequest;

    while (table.rows.length > 1) {
        table.deleteRow(1);
    }

    Papa.parse(url, {
        download: true,
        header: true,
        complete: function (results) {
            if (requestId !== latestTableRequest) {
                return; // A newer request owns the table now.
            }
            let rank = 0;
            results.data.forEach(row => {
                // Skip records with none of the identifying columns (e.g. a trailing blank line).
                const hasData = row['name'] || row['win_rate'] || row['length_controlled_winrate'];
                if (hasData && isRowVisible(row['filter'])) {
                    rank++;
                    table.appendChild(buildLeaderboardRow(row, rank));
                }
            });
        },
        error: function (error) {
            if (requestId === latestTableRequest) {
                console.error('Could not load leaderboard data from ' + url, error);
            }
        }
    });
}
| |
/**
 * Sets the caption in #alpaca_eval_info to the baseline / auto-annotator pair
 * of the given version.
 *
 * An unknown version leaves the caption untouched (previously the literal text
 * "undefined" was written). The caption is plain text, so it is assigned through
 * textContent rather than innerHTML.
 *
 * @param {string} version Either 'alpaca_eval' or 'alpaca_eval_2'.
 */
function updateInfoMessage(version) {
    const infoByVersion = {
        alpaca_eval_2: 'Baseline: GPT-4 Preview (11/06) | Auto-annotator: GPT-4 Preview (11/06)',
        alpaca_eval: 'Baseline: Davinci003 | Auto-annotator: GPT-4',
    };
    if (!Object.prototype.hasOwnProperty.call(infoByVersion, version)) {
        return;
    }

    const info = document.getElementById('alpaca_eval_info');
    if (info) {
        info.textContent = infoByVersion[version];
    }
}
| |
// ---- Column sorting -------------------------------------------------------

// Header key currently sorted on (null when rows are in load order) and its direction.
let activeSort = { column: null, ascending: true };

/** Puts every header arrow back to the neutral glyph and forgets the active sort. */
function resetSortIndicators() {
    document.querySelectorAll('#leaderboard [id^="arrow-"]').forEach(function (arrow) {
        arrow.textContent = '↕';
    });
    activeSort = { column: null, ascending: true };
}

/**
 * Orders two cell texts: numerically when both are plain numbers (an optional
 * trailing '%' is ignored), numbers before non-numbers, otherwise alphabetically.
 */
function compareCellText(left, right) {
    const toNumber = function (text) {
        const stripped = text.replace(/%$/, '');
        return stripped === '' ? NaN : Number(stripped);
    };
    const leftNumber = toNumber(left);
    const rightNumber = toNumber(right);
    const leftIsNumber = !Number.isNaN(leftNumber);
    const rightIsNumber = !Number.isNaN(rightNumber);

    if (leftIsNumber && rightIsNumber) {
        return leftNumber - rightNumber;
    }
    if (leftIsNumber !== rightIsNumber) {
        return leftIsNumber ? -1 : 1;
    }
    return left.localeCompare(right);
}

/**
 * Click handler referenced by the table headers (onclick="sortTable('<key>')").
 * Sorts the rows below the header by the text of the column whose header holds
 * the #arrow-<key> span. The first click sorts ascending, a repeated click on the
 * same header flips the direction. Rank cells move with their row and are not
 * renumbered.
 *
 * @param {string} column Header key, e.g. 'algorithm' or 'mean_f1'.
 */
function sortTable(column) {
    const arrow = document.getElementById('arrow-' + column);
    const header = arrow ? arrow.closest('th') : null;
    if (!header) {
        return;
    }

    // Decide the direction before the indicators (and the stored state) are reset.
    const ascending = activeSort.column === column ? !activeSort.ascending : true;
    const columnIndex = header.cellIndex;
    const cellText = function (row) {
        const cell = row.cells[columnIndex];
        return cell ? cell.textContent.trim() : '';
    };

    const dataRows = Array.from(table.rows).slice(1);
    dataRows.sort(function (a, b) {
        const order = compareCellText(cellText(a), cellText(b));
        return ascending ? order : -order;
    });
    // Re-appending an attached row moves it, so this rewrites the order in place.
    dataRows.forEach(function (row) {
        table.appendChild(row);
    });

    resetSortIndicators();
    activeSort = { column: column, ascending: ascending };
    arrow.textContent = ascending ? '↑' : '↓';
}

// ---- Toggle wiring --------------------------------------------------------

/** Reloads the current CSV; freshly loaded rows are unsorted, so the arrows are reset. */
function reloadTable() {
    resetSortIndicators();
    updateTable(currentUrl);
}

/** Switches the leaderboard to the given version and refreshes table and caption. */
function selectVersion(version) {
    currentUrl = urls[version];
    reloadTable();
    updateInfoMessage(version);
}

alpacaEval2Radio.addEventListener('click', function () {
    selectVersion('alpaca_eval_2');
});

alpacaEvalRadio.addEventListener('click', function () {
    selectVersion('alpaca_eval');
});

communityRadio.addEventListener('click', function () {
    reloadTable();
});

verifiedRadio.addEventListener('click', function () {
    reloadTable();
});

// Initial render. This replaces a call to updateCautionMessage(), which is not
// defined anywhere in this file and ended the script with a ReferenceError.
selectVersion(alpacaEvalRadio.checked ? 'alpaca_eval' : 'alpaca_eval_2');
| </script> |
|
|
|
|
| </body> |
|
|
| </html> |