Spaces:

hlnicholls
/

nucleotide_transformer_benchmark

Sleeping

File size: 10,441 Bytes

d0b5dce
eec7921
 
 
d0b5dce
eec7921
d0b5dce
 
 
0ad2349
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eec7921
 
561f8a4
 
eec7921
 
 
 
561f8a4
d32df58
eec7921
 
d0b5dce
eec7921
2a1736a
eec7921
2a1736a
1e9588a
 
 
 
 
 
3b36d4e
1e9588a
 
2a1736a
1e9588a
 
c10b04b
eec7921
 
d0b5dce
eec7921
 
 
d0b5dce
eec7921
 
d0b5dce
0ad2349
 
 
 
 
d0b5dce
0ad2349
eec7921
 
 
 
d0b5dce
 
 
eec7921
d0b5dce
eec7921
 
 
 
d0b5dce
 
eec7921
45c5007
eec7921
 
 
 
 
 
 
5c93746
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45c5007
5c93746
 
 
 
 
 
 
 
 
 
 
 
 
eec7921
 
940a8fb
 
c10b04b
eec7921
 
 
d0b5dce
eec7921
 
 
 
d0b5dce
eec7921
 
 
 
 
0ad2349
 
 
a5c85f4
0ad2349
a5c85f4
0ad2349
 
 
 
a5c85f4
0ad2349
a5c85f4
0ad2349
 
 
a5c85f4
0ad2349
a5c85f4
eec7921
d0b5dce
eec7921
 
d0b5dce
 
 
eec7921
5c93746
 
 
c1d9995
 
5c93746
 
a13c03f
c10b04b
eec7921
 
 
 
 
 
d0b5dce
 
eec7921
 
45c5007
 
eec7921
0ad2349
 
 
 
 
 
 
 
 
 
 
d0b5dce
0ad2349
d0b5dce
 
 
 
0ad2349
d0b5dce
 
 
 
0ad2349
d0b5dce
 
 
 
0ad2349
d0b5dce
 
eec7921
5c93746
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eec7921

from typing import List

import gradio as gr
import numpy as np
import pandas as pd

# Benchmark results, read once when the module is imported. The functions
# below use its "Model" and "Dataset" columns plus one column per entry of
# _METRICS.
_ORIGINAL_DF = pd.read_csv("./data/benchmark.csv")
# Metric columns of the benchmark table. Each cell is parsed by
# retrieve_array_from_text as a bracketed, comma-separated list of floats
# (presumably one value per validation fold -- confirm against the CSV).
_METRICS = ["MCC", "F1", "ACC"]
# Names looked up on the numpy module (getattr(np, name)) to collapse each
# parsed score array into a single number.
_AGGREGATION_METHODS = ["mean", "max", "min", "median"]
# Task names grouped by category. Each list feeds one CheckboxGroup in the UI,
# and the selected names are matched against the "Dataset" column.
_TASKS = {
    "histone_marks": [
        "H4",
        "H3",
        "H3K14ac",
        "H3K4me1",
        "H3K4me3",
        "H3K4me2",
        "H3K36me3",
        "H4ac",
        "H3K79me3",
        "H3K9ac",
    ],
    "regulatory_elements": [
        "promoter_no_tata",
        "enhancers",
        "enhancers_types",
        "promoter_all",
        "promoter_tata",
    ],
    "RNA_production": [
        "splice_sites_donors",
        "splice_sites_all",
        "splice_sites_acceptors",
    ],
}

# BibTeX entry displayed in the "Citation" textbox at the bottom of the page.
_BIBTEX = """@article{DallaTorre2023TheNT,
  title={The Nucleotide Transformer: Building and Evaluating Robust Foundation Models for Human Genomics},
  author={Hugo Dalla-Torre and Liam Gonzalez and Javier Mendoza Revilla and Nicolas Lopez Carranza and Adam Henryk Grzywaczewski and Francesco Oteri and Christian Dallago and Evan Trop and Hassan Sirelkhatim and Guillaume Richard and Marcin J. Skwark and Karim Beguir and Marie Lopez and Thomas Pierrot},
  journal={bioRxiv},
  year={2023},
  url={https://api.semanticscholar.org/CorpusID:255943445}
}
"""  # noqa
# Date string rendered in the "Last updated on ..." line; maintained by hand.
_LAST_UPDATED = "Sept 15, 2023"

# Path of the logo image shown by gr.Image at the top of the page.
banner_url = "./assets/logo.png"
# HTML wrapper around the same image. NOTE(review): not referenced anywhere
# else in the visible part of this file.
_BANNER = f'<div style="display: flex; justify-content: space-around;"><img src="{banner_url}" alt="Banner" style="width: 40vw; min-width: 300px; max-width: 600px;"> </div>'  # noqa

# Markdown shown next to the banner image in the first row of the page.
_INTRODUCTION_TEXT = """The 🤗 Nucleotide Transformer Leaderboard aims to track, rank and evaluate DNA foundational models on a set of curated downstream tasks introduced in the huggingface dataset [nucleotide_transformer_downstream_tasks](https://huggingface.co/datasets/InstaDeepAI/nucleotide_transformer_downstream_tasks), with a standardized evaluation protocol presented in the "ℹ️ Methods" tab.\n\n

This leaderboard has been designed to provide, to the best of our ability, fair and robust comparisons between models. If you have any question or concern regarding our methodology or if you would like another model to appear in this leaderboard, please reach out to m.lopez@instadeep.com and t.pierrot@instadeep.com. While we may not be able to take into consideration all requests, the team will always do its best to ensure that benchmark stays as fair, relevant and up-to-date as possible.\n\n
 """  # noqa

_METHODS_TEXT = """
This leaderboard uses the downstream tasks benchmark and evaluation methdology described in the Nucleotide Transformer paper. We fine-tune each model on each task using a ten-fold validation strategy. For each model and each task, we report the aggregation over the ten-folds for several metrics - the Matthew Correlation Coefficient (MCC), the macro f1-score (F1) and the accuracy (ACC). The Nucleotide Transformer, DNABert and Enformer models have been fine-tuned using the same parameter efficient fine-tuning technique (IA3) with the same set of hyper-parameters. Due to the different nature of their architecture, the HyenaDNA models have been fully-finetuned using the original code provided by the authors.
\n\n

Please keep in mind that the Enformer has been originally trained in a supervised fashion to solve gene expression tasks. For the sake of benchmarking, we re-used the provided model torso as a pre-trained model for our benchmark, which is not the intended and recommended use of the original paper. Though we think this comparison is interesting to highlight the differences between self-supervised and supervised learning for pre-training and observe that the Enformer is a very competitive baseline even for tasks that differ from gene expression.
\n\n

For the sake of clarity the tasks being shown by default in this leaderboard are the human related tasks while the original Nucleotide Transformer paper shows performance over both yeast and human related tasks. To obtain the same results as the one shown in the paper, please check all the tasks boxes above.
\n\n
"""  # noqa


def retrieve_array_from_text(text):
    """Parse a bracketed, comma-separated list of numbers into a float array.

    Every ``[`` and ``]`` character is removed from *text*; what remains is
    read as comma-separated floats, e.g. ``"[0.5, 0.25]"`` becomes
    ``array([0.5, 0.25])``.
    """
    # One translate pass deletes both bracket characters.
    bracket_free = text.translate(str.maketrans("", "", "[]"))
    return np.fromstring(bracket_free, sep=",", dtype=float)


def format_number(x):
    """Return *x* rounded to three significant digits, as a float."""
    # ".3" is a general-format precision: significant digits, not decimals.
    three_digits = format(x, ".3")
    return float(three_digits)


def get_dataset(
    histone_tasks: List[str],
    regulatory_tasks: List[str],
    rna_tasks: List[str],
    target_metric: str = "MCC",
    aggregation_method: str = "mean",
):
    """Build the leaderboard table for the currently selected tasks.

    Args:
        histone_tasks: Selected task names from the histone group.
        regulatory_tasks: Selected task names from the regulatory group.
        rna_tasks: Selected task names from the RNA group.
        target_metric: Column of the benchmark table to read scores from.
        aggregation_method: Name of the function looked up on ``np`` and
            applied to each parsed score array.

    Returns:
        A non-interactive ``gr.components.Dataframe`` with one row per model,
        one column per selected task and an "All Tasks" column holding the
        row mean, sorted by "All Tasks" in descending order.
    """
    selected_tasks = histone_tasks + regulatory_tasks + rna_tasks
    aggregate = getattr(np, aggregation_method)

    # Cell text -> float array -> single aggregated value -> 3 significant digits.
    parsed = _ORIGINAL_DF[target_metric].apply(retrieve_array_from_text)
    rounded = parsed.apply(aggregate).apply(format_number)

    long_table = _ORIGINAL_DF.drop(columns=_METRICS)
    long_table["Score"] = rounded

    # One row per model, one column per dataset, restricted to the selection.
    wide_table = long_table.pivot(index="Model", columns="Dataset", values="Score")
    wide_table = wide_table[selected_tasks]
    row_means = wide_table.agg("mean", axis="columns")
    wide_table["All Tasks"] = row_means.apply(format_number)

    # Alphabetical column order, then turn the "Model" index into a column.
    wide_table = wide_table[sorted(wide_table.columns.values)]
    wide_table = wide_table.reset_index()
    wide_table = wide_table.rename(columns={"index": "Model"})
    wide_table = wide_table.sort_values(by=["All Tasks"], ascending=False)

    return gr.components.Dataframe(
        value=wide_table,
        interactive=False,
        visible=True,
    )


def get_bar_plot(
    histone_tasks: List[str],
    regulatory_tasks: List[str],
    rna_tasks: List[str],
    target_metric: str = "MCC",
    aggregation_method: str = "mean",
):
    """Build the per-model bar plot for the currently selected tasks.

    Args:
        histone_tasks: Selected task names from the histone group.
        regulatory_tasks: Selected task names from the regulatory group.
        rna_tasks: Selected task names from the RNA group.
        target_metric: Column of the benchmark table to read scores from.
        aggregation_method: Name of the function looked up on ``np`` and
            applied to each parsed score array.

    Returns:
        A ``gr.BarPlot`` with models on the x axis, the scaled score on the
        y axis and one colour per dataset. The plot is empty when no task is
        selected.
    """
    tasks = histone_tasks + regulatory_tasks + rna_tasks

    aggr_fn = getattr(np, aggregation_method)
    scores = _ORIGINAL_DF[target_metric].apply(retrieve_array_from_text).apply(aggr_fn)
    scores = scores.apply(format_number)
    df = _ORIGINAL_DF.drop(columns=_METRICS)
    # Each score is divided by the number of selected tasks (presumably so the
    # per-task segments of one model add up to its mean over those tasks).
    # With nothing selected every row is filtered out below, so the divisor
    # value is irrelevant; max() only avoids dividing by zero.
    df["Score"] = scores / max(len(tasks), 1)
    # Filter with isin() instead of interpolating the list into a query()
    # expression string, which breaks on names containing quotes.
    df = df[df["Dataset"].isin(tasks)]

    bar_plot = gr.BarPlot(
        df,
        x="Model",
        y="Score",
        color="Dataset",
        width=500,
        x_label_angle=-45,
        x_title="Model",
        y_title="Score",
        color_legend_title="Downstream Task",
    )
    return bar_plot


# Page layout, top to bottom: banner + introduction, metric/aggregation
# selectors, task selectors, result tabs, "last updated" line, citation box.
# The event wiring at the end refreshes both outputs on every control change.
with gr.Blocks() as demo:
    with gr.Row():
        gr.Image(banner_url, height=160, scale=1)
        gr.Markdown(_INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Row():
        metric_choice = gr.Dropdown(
            choices=_METRICS,
            value="MCC",
            label="Metric displayed.",
        )
        aggr_choice = gr.Dropdown(
            choices=_AGGREGATION_METHODS,
            value="mean",
            label="Aggregation used over 10-folds.",
        )

    with gr.Row():
        regulatory_tasks = gr.CheckboxGroup(
            choices=_TASKS["regulatory_elements"],
            value=_TASKS["regulatory_elements"],
            label="Regulatory Elements Downstream Tasks.",
            info="Human data.",
            scale=3,
        )
        rna_tasks = gr.CheckboxGroup(
            choices=_TASKS["RNA_production"],
            value=_TASKS["RNA_production"],
            label="RNA Production Downstream Tasks.",
            info="Human data.",
            scale=3,
        )
        # No default value: the histone tasks start unchecked.
        histone_tasks = gr.CheckboxGroup(
            choices=_TASKS["histone_marks"],
            label="Histone Modification Downstream Tasks.",
            info="Yeast data.",
            scale=4,
        )

    # Each tab gets its own elem_id so the DOM does not contain duplicate ids.
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
            dataframe = gr.components.Dataframe(
                elem_id="leaderboard-table",
            )

        with gr.TabItem("📈 Graph", elem_id="od-benchmark-tab-graph", id=2):
            # Placeholder; x matches the "Model" column used by get_bar_plot.
            bar_plot = gr.BarPlot(
                elem_id="leaderboard-bar-plot",
                x="Model",
                y="Score",
            )

        with gr.TabItem("ℹ️ Methods", elem_id="od-benchmark-tab-methods", id=1):
            gr.Markdown(_METHODS_TEXT, elem_classes="markdown-text")

    gr.Markdown(f"Last updated on **{_LAST_UPDATED}**", elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            gr.Textbox(
                value=_BIBTEX,
                lines=7,
                label="Copy the BibTeX snippet to cite this source",
                elem_id="citation-button",
                show_copy_button=True,
            )

    # Both refresh callbacks take the same five inputs, in this order (it
    # matches the positional parameters of get_dataset and get_bar_plot).
    _controls = [histone_tasks, regulatory_tasks, rna_tasks, metric_choice, aggr_choice]

    # For each output: recompute on any control change and on page load.
    # Registration order is the same as the former hand-written list.
    for _refresh_fn, _output in ((get_dataset, dataframe), (get_bar_plot, bar_plot)):
        for _control in _controls:
            _control.change(
                _refresh_fn,
                inputs=list(_controls),
                outputs=_output,
            )
        demo.load(
            fn=_refresh_fn,
            inputs=list(_controls),
            outputs=_output,
        )

# Start the Gradio app. This call is not guarded by `if __name__ == "__main__"`,
# so it runs whenever this module is executed or imported.
demo.launch()