Spaces:
Running
Running
Upload from GitHub Actions: Use task subset for average score
Browse files
evals/backend.py
CHANGED
|
@@ -28,6 +28,8 @@ task_metrics = [
|
|
| 28 |
"mgsm_accuracy",
|
| 29 |
]
|
| 30 |
|
|
|
|
|
|
|
| 31 |
|
| 32 |
def compute_normalized_average(df, metrics):
|
| 33 |
"""Compute average of min-max normalized metric columns."""
|
|
@@ -52,7 +54,7 @@ def make_model_table(df, models):
|
|
| 52 |
df["task_metric"] = df["task"] + "_" + df["metric"]
|
| 53 |
df = df.drop(columns=["task", "metric"])
|
| 54 |
df = df.pivot(index="model", columns="task_metric", values="score")
|
| 55 |
-
df["average"] = compute_normalized_average(df,
|
| 56 |
df = df.sort_values(by="average", ascending=False).reset_index()
|
| 57 |
df = pd.merge(df, models, left_on="model", right_on="id", how="left")
|
| 58 |
df["rank"] = df.index + 1
|
|
@@ -84,7 +86,7 @@ def make_language_table(df, languages):
|
|
| 84 |
df["task_metric"] = df["task"] + "_" + df["metric"]
|
| 85 |
df = df.drop(columns=["task", "metric"])
|
| 86 |
df = df.pivot(index="bcp_47", columns="task_metric", values="score").reset_index()
|
| 87 |
-
df["average"] = compute_normalized_average(df,
|
| 88 |
df = pd.merge(languages, df, on="bcp_47", how="outer")
|
| 89 |
df = df.sort_values(by="speakers", ascending=False)
|
| 90 |
df = df[
|
|
|
|
| 28 |
"mgsm_accuracy",
|
| 29 |
]
|
| 30 |
|
| 31 |
+
# Metric columns passed to compute_normalized_average() to build the
# "average" column of both the model table and the language table.
task_metrics_basic = [
    "translation_from_bleu",
    "translation_to_bleu",
    "classification_accuracy",
]
|
| 32 |
+
|
| 33 |
|
| 34 |
def compute_normalized_average(df, metrics):
|
| 35 |
"""Compute average of min-max normalized metric columns."""
|
|
|
|
| 54 |
df["task_metric"] = df["task"] + "_" + df["metric"]
|
| 55 |
df = df.drop(columns=["task", "metric"])
|
| 56 |
df = df.pivot(index="model", columns="task_metric", values="score")
|
| 57 |
+
df["average"] = compute_normalized_average(df, task_metrics_basic)
|
| 58 |
df = df.sort_values(by="average", ascending=False).reset_index()
|
| 59 |
df = pd.merge(df, models, left_on="model", right_on="id", how="left")
|
| 60 |
df["rank"] = df.index + 1
|
|
|
|
| 86 |
df["task_metric"] = df["task"] + "_" + df["metric"]
|
| 87 |
df = df.drop(columns=["task", "metric"])
|
| 88 |
df = df.pivot(index="bcp_47", columns="task_metric", values="score").reset_index()
|
| 89 |
+
df["average"] = compute_normalized_average(df, task_metrics_basic)
|
| 90 |
df = pd.merge(languages, df, on="bcp_47", how="outer")
|
| 91 |
df = df.sort_values(by="speakers", ascending=False)
|
| 92 |
df = df[
|
frontend/src/components/LanguagePlot.js
CHANGED
|
@@ -3,38 +3,38 @@ import * as Plot from '@observablehq/plot'
|
|
| 3 |
|
| 4 |
const LanguagePlot = ({ data, width = 750, height = 500 }) => {
|
| 5 |
const containerRef = useRef()
|
| 6 |
-
const languages = data.language_table.filter(a => a.
|
| 7 |
const families = [...new Set(languages.map(a => a.family))]
|
| 8 |
|
| 9 |
useEffect(() => {
|
| 10 |
const plot = Plot.plot({
|
| 11 |
width: width,
|
| 12 |
height: height,
|
| 13 |
-
subtitle: '
|
| 14 |
x: {
|
| 15 |
label: 'Number of Speakers',
|
| 16 |
type: 'log'
|
| 17 |
},
|
| 18 |
y: {
|
| 19 |
-
label: '
|
| 20 |
},
|
| 21 |
marks: [
|
| 22 |
Plot.dot(languages, {
|
| 23 |
x: 'speakers',
|
| 24 |
-
y: d => d.
|
| 25 |
r: 'speakers',
|
| 26 |
fill: 'family',
|
| 27 |
title: d =>
|
| 28 |
`${d.language_name}\n${d.speakers.toLocaleString('en-US', {
|
| 29 |
notation: 'compact'
|
| 30 |
-
})} speakers\nScore: ${d.
|
| 31 |
tip: true
|
| 32 |
}),
|
| 33 |
Plot.text(
|
| 34 |
languages.filter(a => a.speakers > 1e8),
|
| 35 |
{
|
| 36 |
x: 'speakers',
|
| 37 |
-
y: d => d.
|
| 38 |
text: d => d.language_name,
|
| 39 |
fill: 'black',
|
| 40 |
frameAnchor: 'left',
|
|
|
|
| 3 |
|
| 4 |
const LanguagePlot = ({ data, width = 750, height = 500 }) => {
|
| 5 |
const containerRef = useRef()
|
| 6 |
+
const languages = data.language_table.filter(a => a.average > 0)
|
| 7 |
const families = [...new Set(languages.map(a => a.family))]
|
| 8 |
|
| 9 |
useEffect(() => {
|
| 10 |
const plot = Plot.plot({
|
| 11 |
width: width,
|
| 12 |
height: height,
|
| 13 |
+
subtitle: 'Proficiency scores by language',
|
| 14 |
x: {
|
| 15 |
label: 'Number of Speakers',
|
| 16 |
type: 'log'
|
| 17 |
},
|
| 18 |
y: {
|
| 19 |
+
label: 'Language proficiency score'
|
| 20 |
},
|
| 21 |
marks: [
|
| 22 |
Plot.dot(languages, {
|
| 23 |
x: 'speakers',
|
| 24 |
+
y: d => d.average,
|
| 25 |
r: 'speakers',
|
| 26 |
fill: 'family',
|
| 27 |
title: d =>
|
| 28 |
`${d.language_name}\n${d.speakers.toLocaleString('en-US', {
|
| 29 |
notation: 'compact'
|
| 30 |
+
})} speakers\nScore: ${d.average.toFixed(2)}`,
|
| 31 |
tip: true
|
| 32 |
}),
|
| 33 |
Plot.text(
|
| 34 |
languages.filter(a => a.speakers > 1e8),
|
| 35 |
{
|
| 36 |
x: 'speakers',
|
| 37 |
+
y: d => d.average,
|
| 38 |
text: d => d.language_name,
|
| 39 |
fill: 'black',
|
| 40 |
frameAnchor: 'left',
|
frontend/src/components/ScoreColumns.js
CHANGED
|
@@ -13,8 +13,8 @@ const scoreBodyTemplate = (field, options = {}) => {
|
|
| 13 |
const ScoreColumns = [
|
| 14 |
<Column
|
| 15 |
field='average'
|
| 16 |
-
header='
|
| 17 |
-
headerTooltip='Language Proficiency Score (average
|
| 18 |
sortable
|
| 19 |
body={scoreBodyTemplate('average', { minScore: 0.2, maxScore: 0.5 })}
|
| 20 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
|
|
|
| 13 |
const ScoreColumns = [
|
| 14 |
<Column
|
| 15 |
field='average'
|
| 16 |
+
header='Proficiency'
|
| 17 |
+
headerTooltip='Language Proficiency Score (average translation and classification scores, after min-max normalization)'
|
| 18 |
sortable
|
| 19 |
body={scoreBodyTemplate('average', { minScore: 0.2, maxScore: 0.5 })}
|
| 20 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|