"""Static text, option lists and colour mapping for the PROBE leaderboard UI.

Everything in this module is a module-level constant: the task registry
(``Task`` / ``Tasks``), the markdown shown on the leaderboard pages, the
option lists for the benchmark selectors and the per-method plot colours.
"""

from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    """One leaderboard task: where to find its score and how to label it."""

    benchmark: str  # task key in the results json file
    metric: str  # metric key in the results json file
    col_name: str  # column name displayed in the leaderboard


# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    """Registry of the tasks shown in the leaderboard."""

    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    task0 = Task("anli_r1", "acc", "ANLI")
    task1 = Task("logiqa", "acc_norm", "LogiQA")


NUM_FEWSHOT = 0  # Change with your few shot
# ---------------------------------------------------


# Your leaderboard name
TITLE = """

PROBE

"""

# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
# (Protein RepresentatiOn BEnchmark): Function-Centric Evaluation of Protein Representation Methods
"""

# Which evaluations are you running? how can people reproduce what you have?
# Plain (non-f) string: the text contains no placeholders.
LLM_BENCHMARKS_TEXT = """
# PROBE (Protein RepresentatiOn BEnchmark): Function-Centric Evaluation of Protein Representation Methods

### PROBE runs benchmark analyses on protein representation/feature vectors of any representation learning method in order to evaluate its predictive performance on protein function related predictive tasks, and to compare it with other methods from the literature.

### Aiming to evaluate how much each representation model captures different facets of functional information, we constructed and applied 4 independent benchmark tests based on:

1. Semantic Similarity Inference:
    - This benchmark evaluates how well protein representation models can infer functional similarities between proteins. Ground truth functional similarities are derived from Gene Ontology (GO) annotations.
    - Different distance metrics (Cosine, Manhattan, Euclidean) are used to compute protein vector similarities, which are then correlated with the functional similarities.

2. Ontology-Based Protein Function Prediction (PFP):
    - This benchmark assesses the ability of representation models to predict ontology-based functional annotations (GO terms). The models are tested on how well they classify proteins based on molecular functions, biological processes, and cellular components.
    - A linear classifier is used to ensure that the models themselves are responsible for good performance, rather than the complexity of the classifier.

3. Drug Target Protein Family Classification:
    - This benchmark focuses on predicting the family of drug target proteins (enzymes, receptors, ion channels, etc.). This task tests the ability of models to learn structural features critical to these classifications.
    - The study evaluates models using datasets with varying sequence similarity thresholds (random, 50%, 30%, 15%) to ensure the models can predict beyond simple sequence similarity.

4. Protein–Protein Binding Affinity Estimation:
    - This benchmark evaluates models' ability to predict the change in binding affinities between proteins due to mutations. The dataset used is the **SKEMPI** dataset, which contains experimentally determined binding affinities.
    - The task measures how well models can extract critical structural features important for protein-protein interactions.

### PROBE is part of the study entitled [Learning functional properties of proteins with language models](https://rdcu.be/cJAKN) which is schematically summarized in the figure below:

![Summary of The Study](https://raw.githubusercontent.com/serbulent/TrainableRepresentationAnalysis/refs/heads/master/evalprotrep_summary_figure.jpg)

### If you find PROBE useful please consider citing!
"""

EVALUATION_QUEUE_TEXT = """
# **Benchmarking your own representation model**

## To run the benchmarks, the following representation vectors need to be generated:

- For benchmarks 1, 2, and 3 (similarity, function, and family), you will need to generate representation vectors for all human proteins. The amino acid sequences for canonical isoforms of human proteins can be found [here](https://drive.google.com/file/d/1wXF2lmj4ZTahMrl66QpYM2TvHmbcIL6b/view?usp=sharing).
- For benchmark 4 (affinity), representation vectors will need to be generated for the samples in the SKEMPI dataset, which can be accessed [here](https://drive.google.com/file/d/1m5jssC0RMsiFT_w-Ykh629Pw_An3PInI/view?usp=sharing).

## Format of both protein representation files:

1. Each row corresponds to the representation vector of a particular protein.
2. Rows & columns: the first column's header should be "Entry", and the rest of the column headers should contain the index numbers that correspond to dimensions of the vector. After the column header, the rows of the first column should contain the UniProt protein accessions of respective proteins (i.e., each row in this file corresponds to a different protein), rows of other columns should contain representation vector values for the corresponding proteins (i.e. each column in this file corresponds to a dimension of representation vectors).
3. All representation vectors in a file should have the same size (i.e., fixed sized vectors).
4. Representation vectors of the whole dataset should be saved in a comma separated (csv) text file.
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""Unsal, S., Atas, H., Albayrak, M., Turhan, K., Acar, A. C., & Doğan, T. (2022).
Learning functional properties of proteins with language models. *Nature Machine Intelligence, 4*(3), 227-245.
"""

# Short keys of the four benchmarks.
TASK_INFO = ["similarity", "function", "family", "affinity"]

CSV_RESULT_PATH = "./src/data/results.csv"

LEADERBOARD_INTRODUCTION = """

PROBE Leaderboard: Protein Representation Model Evaluation

Welcome to the PROBE (Protein RepresentatiOn BEnchmark) leaderboard! This platform evaluates protein representation models based on their ability to capture functional properties of proteins through four key benchmarks:

- **Protein Similarity**: Inferring semantic similarities.
- **Protein Function**: Predicting ontology-based functions.
- **Protein Family**: Classifying drug target families.
- **Protein Affinity**: Estimating binding affinities.

Submit your own representation models and compare their performance across these tasks. For more details on how to participate, see the submission guidelines.

If you find PROBE useful, please consider citing our work."""

# Selectable options for the individual benchmarks.
similarity_tasks_options = ["sparse", "200", "500"]
function_prediction_aspect_options = ["MF", "BP", "CC", "All_Aspects"]
function_prediction_dataset_options = ["High", "Middle", "Low", "All_Data_Sets"]
family_prediction_dataset_options = ["nc", "uc50", "uc30", "mm15"]

# Metric column names per benchmark.
# NOTE(review): the 'Benchmark 2' / 'Benchmark 3' entries look like template
# placeholders (presumably standing in for "function" and "family"); they are
# kept unchanged because callers may look them up by these exact keys.
benchmark_specific_metrics = {
    'similarity': [
        'similarity_corr_MF',
        'similarity_corr_BP',
        'similarity_corr_CC',
        'similarity_corr_Ave',
    ],
    'Benchmark 2': ['Metric B1', 'Metric B2', 'Metric B3'],
    'Benchmark 3': ['Metric C1', 'Metric C2', 'Metric C3'],
    'affinity': ['affinity_mse_train', 'affinity_mse_val'],
}

# Method names as displayed; paired positionally with ``group_colors`` below.
display_labels = [
    'BLAST', 'HMMER', 'K-SEP', 'APAAC', 'PFAM', 'AAC',
    'PROTVEC', 'GENE2VEC', 'LEARNED-VEC', 'MUT2VEC', 'TCGA-EMBEDDING', 'CPCPROT',
    'SEQVEC', 'ProtBERT-BFD', 'TAPE-BERT-PFAM', 'ESM-1b', 'ProtALBERT',
    'ProtXLNet', 'UNIREP', 'ProtT5-XL', 'MSA-Transformer',
]

# One colour per entry of ``display_labels``, in the same order.
group_colors = [
    'green', 'green', 'green', 'green', 'green', 'green',
    'blue', 'blue', 'blue', 'blue', 'blue', 'blue',
    'red', 'red', 'red', 'red', 'red', 'red', 'red', 'red', 'red',
]

# zip() silently drops unmatched trailing items, so fail loudly if someone
# edits one list without the other.
if len(display_labels) != len(group_colors):
    raise ValueError(
        f"display_labels has {len(display_labels)} entries but "
        f"group_colors has {len(group_colors)}; they must match"
    )

# Create a color dictionary from the labels and colors.
# Keys are the upper-cased labels (e.g. 'ESM-1b' -> 'ESM-1B').
color_dict = {
    method.upper(): color for method, color in zip(display_labels, group_colors)
}