diff --git a/.gitignore b/.gitignore index 100ab8861c846a556d7540e1fe2ab4cd72ad5e00..a8de302f89b0af22888ac234e3b0d1a6f208a591 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,5 @@ __pycache__/ #eval-queue-bk/ #eval-results-bk/ logs/ +.idea/ + diff --git a/.idea/OmniGenomeLeaderboard.iml b/.idea/OmniGenomeLeaderboard.iml new file mode 100644 index 0000000000000000000000000000000000000000..ec63674cd7f4d511fb06cd63eaeba166d6bc0dd8 --- /dev/null +++ b/.idea/OmniGenomeLeaderboard.iml @@ -0,0 +1,7 @@ + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000000000000000000000000000000000000..5366c82fe56453c92209a591340f4ded829c1441 --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,88 @@ + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000000000000000000000000000000000000..105ce2da2d6447d11dfe32bfb846c3d5b199fc99 --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000000000000000000000000000000000000..a5fe07b7a5ab8aa8bd5661a6ae8a7cc36d74e052 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,6 @@ + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000000000000000000000000000000000000..35eb1ddfbbc029bcab630581847471d7f238ec53 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/workspace.xml b/.idea/workspace.xml index f4557d4f6e138a5677cdffeefbdff11ef0e0c204..b99212d75d144c01b2ab00e0dda6e6238f4d79ff 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -5,8 +5,57 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - @@ -94,7 +143,9 @@ - + + + @@ -134,6 +185,6 @@ - + \ No newline at end of file diff --git a/app.py b/app.py index 22109eb72f0047457e17bf86e3d1934c9bc5144d..7e18c3fa4b51124de631038d7ba61a77a8fa32c3 100644 --- a/app.py +++ b/app.py @@ -35,31 +35,30 @@ def restart_space(): ### Space initialisation - -try: - print(EVAL_REQUESTS_PATH) - snapshot_download( - repo_id=QUEUE_REPO, - local_dir=EVAL_REQUESTS_PATH, - repo_type="dataset", - tqdm_class=None, - etag_timeout=30, - token=TOKEN, - ) -except Exception: - restart_space() -try: - print(EVAL_RESULTS_PATH) - snapshot_download( - repo_id=RESULTS_REPO, - local_dir=EVAL_RESULTS_PATH, - repo_type="dataset", - tqdm_class=None, - etag_timeout=30, - token=TOKEN, - ) -except Exception: - restart_space() +# try: +# print(EVAL_REQUESTS_PATH) +# snapshot_download( +# repo_id=QUEUE_REPO, +# local_dir=EVAL_REQUESTS_PATH, +# repo_type="dataset", +# tqdm_class=None, +# etag_timeout=30, +# token=TOKEN, +# ) +# except Exception: +# restart_space() +# try: +# print(EVAL_RESULTS_PATH) +# snapshot_download( +# repo_id=RESULTS_REPO, +# local_dir=EVAL_RESULTS_PATH, +# repo_type="dataset", +# tqdm_class=None, +# etag_timeout=30, +# token=TOKEN, +# ) +# except Exception: +# restart_space() RGB_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH+"/RGB/", EVAL_REQUESTS_PATH+"/RGB/", RGB_COLS, RGB_BENCHMARK_COLS) PGB_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH+"/PGB/", EVAL_REQUESTS_PATH+"/PGB/", PGB_COLS, PGB_BENCHMARK_COLS) diff --git a/eval-queue/GB/InstaDeepAI/nucleotide-transformer-v2-100m-multi-species_eval_request_False_bfloat16_Original.json b/eval-queue/GB/InstaDeepAI/nucleotide-transformer-v2-100m-multi-species_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..2471a5c9364821d4d97616d03ed65047aaac92fc --- /dev/null +++ b/eval-queue/GB/InstaDeepAI/nucleotide-transformer-v2-100m-multi-species_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "InstaDeepAI/nucleotide-transformer-v2-100m-multi-species", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-21T18:10:08Z", "model_type": "\ud83d\udfe2 : pretrained", "likes": 0, "params": 96, "license": "custom"} \ No newline at end of file diff --git a/eval-queue/GB/kuleshov-group/caduceus-ps_seqlen-131k_d_model-256_n_layer-16_eval_request_False_bfloat16_Original.json b/eval-queue/GB/kuleshov-group/caduceus-ps_seqlen-131k_d_model-256_n_layer-16_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..0ea7d38471486d3d8f8e0df988dfd653e456f51a --- /dev/null +++ b/eval-queue/GB/kuleshov-group/caduceus-ps_seqlen-131k_d_model-256_n_layer-16_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "kuleshov-group/caduceus-ps_seqlen-131k_d_model-256_n_layer-16", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-21T18:10:08Z", "model_type": "\ud83d\udfe2 : pretrained", "likes": 0, "params": 7.73, "license": "custom"} \ No newline at end of file diff --git a/eval-queue/GUE/InstaDeepAI/nucleotide-transformer-v2-100m-multi-species_eval_request_False_bfloat16_Original.json b/eval-queue/GUE/InstaDeepAI/nucleotide-transformer-v2-100m-multi-species_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..2471a5c9364821d4d97616d03ed65047aaac92fc --- /dev/null +++ b/eval-queue/GUE/InstaDeepAI/nucleotide-transformer-v2-100m-multi-species_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "InstaDeepAI/nucleotide-transformer-v2-100m-multi-species", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-21T18:10:08Z", "model_type": "\ud83d\udfe2 : pretrained", "likes": 0, "params": 96, "license": "custom"} \ No newline at end of file diff --git a/eval-queue/GUE/kuleshov-group/caduceus-ps_seqlen-131k_d_model-256_n_layer-16_eval_request_False_bfloat16_Original.json b/eval-queue/GUE/kuleshov-group/caduceus-ps_seqlen-131k_d_model-256_n_layer-16_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..0ea7d38471486d3d8f8e0df988dfd653e456f51a --- /dev/null +++ b/eval-queue/GUE/kuleshov-group/caduceus-ps_seqlen-131k_d_model-256_n_layer-16_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "kuleshov-group/caduceus-ps_seqlen-131k_d_model-256_n_layer-16", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-21T18:10:08Z", "model_type": "\ud83d\udfe2 : pretrained", "likes": 0, "params": 7.73, "license": "custom"} \ No newline at end of file diff --git a/eval-queue/PGB/InstaDeepAI/nucleotide-transformer-v2-100m-multi-species_eval_request_False_bfloat16_Original.json b/eval-queue/PGB/InstaDeepAI/nucleotide-transformer-v2-100m-multi-species_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..2471a5c9364821d4d97616d03ed65047aaac92fc --- /dev/null +++ b/eval-queue/PGB/InstaDeepAI/nucleotide-transformer-v2-100m-multi-species_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "InstaDeepAI/nucleotide-transformer-v2-100m-multi-species", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-21T18:10:08Z", "model_type": "\ud83d\udfe2 : pretrained", "likes": 0, "params": 96, "license": "custom"} \ No newline at end of file diff --git a/eval-queue/PGB/kuleshov-group/caduceus-ps_seqlen-131k_d_model-256_n_layer-16_eval_request_False_bfloat16_Original.json b/eval-queue/PGB/kuleshov-group/caduceus-ps_seqlen-131k_d_model-256_n_layer-16_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..0ea7d38471486d3d8f8e0df988dfd653e456f51a --- /dev/null +++ b/eval-queue/PGB/kuleshov-group/caduceus-ps_seqlen-131k_d_model-256_n_layer-16_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "kuleshov-group/caduceus-ps_seqlen-131k_d_model-256_n_layer-16", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-21T18:10:08Z", "model_type": "\ud83d\udfe2 : pretrained", "likes": 0, "params": 7.73, "license": "custom"} \ No newline at end of file diff --git a/eval-queue/RGB/InstaDeepAI/nucleotide-transformer-v2-100m-multi-species_eval_request_False_bfloat16_Original.json b/eval-queue/RGB/InstaDeepAI/nucleotide-transformer-v2-100m-multi-species_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..b253088dd1195bd80fc63e2184ecd9231d02cc14 --- /dev/null +++ b/eval-queue/RGB/InstaDeepAI/nucleotide-transformer-v2-100m-multi-species_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "InstaDeepAI/nucleotide-transformer-v2-100m-multi-species", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-21T18:10:08Z", "model_type": "\ud83d\udfe2 : pretrained", "likes": 0, "params": 100, "license": "custom"} \ No newline at end of file diff --git a/eval-results/GB/3UTRBERT.json b/eval-results/GB/3UTRBERT.json new file mode 100644 index 0000000000000000000000000000000000000000..ce357a624ac6d43f25bc2ecdbcc16c11fe9787e3 --- /dev/null +++ b/eval-results/GB/3UTRBERT.json @@ -0,0 +1,48 @@ +{ + "config":{ + "model":"multimolecule/utrbert-4mer", + "model_args":"pretrained=multimolecule/utrbert-4mer,revision=main,dtype=bfloat16", + "num_fewshot":0, + "batch_size":1, + "batch_sizes":[ + + ], + "device":"cpu", + "no_cache":true, + "limit":20, + "bootstrap_iters":100000, + "description_dict":{"Tokenization":"k-mers", "# of Params":"86M", "Pretraining Data":"20,362 Sequences", "Species":"Multi-Species", "Nucleic Acid":"mRNA 3'UTR"}, + "model_dtype":"bfloat16", + "model_name":"multimolecule/utrbert-4mer", + "model_sha":"main" + }, + "results":{ + "DEM":{ + "F1":0.8950 + }, + "DOW":{ + "F1":0.9022 + }, + "DRE":{ + "F1":0.7435 + }, + "DME":{ + "F1":0.8014 + }, + "HCE":{ + "F1":0.7023 + }, + "HEE":{ + "F1":0.7633 + }, + "HRE":{ + "F1":0.9847 + }, + "HNP":{ + "F1":0.8249 + }, + "HOR":{ + "F1":0.6678 + } + } +} \ No newline at end of file diff --git a/eval-results/GB/Caduceus.json b/eval-results/GB/Caduceus.json new file mode 100644 index 0000000000000000000000000000000000000000..4240022d6b17dd8de28fc9e7dc9c5a1873760630 --- /dev/null +++ b/eval-results/GB/Caduceus.json @@ -0,0 +1,48 @@ +{ + "config":{ + "model":"kuleshov-group/caduceus-ps_seqlen-131k_d_model-256_n_layer-16", + "model_args":"pretrained= kuleshov-group/caduceus-ps_seqlen-131k_d_model-256_n_layer-16,revision=main,dtype=bfloat16", + "num_fewshot":0, + "batch_size":1, + "batch_sizes":[ + + ], + "device":"cpu", + "no_cache":true, + "limit":20, + "bootstrap_iters":100000, + "description_dict":{"Tokenization":"SNT", "# of Params":"7.73M", "Pretraining Data":"35 billion nucleotide base pairs", "Species":"Human", "Nucleic Acid":"DNA"}, + "model_dtype":"bfloat16", + "model_name":"kuleshov-group/caduceus-ps_seqlen-131k_d_model-256_n_layer-16", + "model_sha":"main" + }, + "results":{ + "DEM":{ + "F1":0.9213 + }, + "DOW":{ + "F1":0.9474 + }, + "DRE":{ + "F1":0.7203 + }, + "DME":{ + "F1":0.7561 + }, + "HCE":{ + "F1":0.7020 + }, + "HEE":{ + "F1":0.7647 + }, + "HRE":{ + "F1":0.7916 + }, + "HNP":{ + "F1":0.8436 + }, + "HOR":{ + "F1":0.6317 + } + } +} \ No newline at end of file diff --git a/eval-results/GB/DNABERT-2-117M.json b/eval-results/GB/DNABERT-2-117M.json new file mode 100644 index 0000000000000000000000000000000000000000..dc6e4754bd33a1a1da61ab44d2c2bbb87dc1b96a --- /dev/null +++ b/eval-results/GB/DNABERT-2-117M.json @@ -0,0 +1,48 @@ +{ + "config":{ + "model":"zhihan1996/DNABERT-2-117M", + "model_args":"pretrained=zhihan1996/DNABERT-2-117M,revision=main,dtype=bfloat16", + "num_fewshot":0, + "batch_size":1, + "batch_sizes":[ + + ], + "device":"cpu", + "no_cache":true, + "limit":20, + "bootstrap_iters":100000, + "description_dict":{"Tokenization":"BPE", "# of Params":"117M", "Pretraining Data":"32.49B Tokens", "Species":"Human + 135 Species", "Nucleic Acid":"DNA"}, + "model_dtype":"bfloat16", + "model_name":"zhihan1996/DNABERT-2-117M", + "model_sha":"main" + }, + "results":{ + "DEM":{ + "F1":0.9267 + }, + "DOW":{ + "F1":0.9517 + }, + "DRE":{ + "F1":0.4377 + }, + "DME":{ + "F1":0.7721 + }, + "HCE":{ + "F1":0.7558 + }, + "HEE":{ + "F1":0.8066 + }, + "HRE":{ + "F1":0.7814 + }, + "HNP":{ + "F1":0.8580 + }, + "HOR":{ + "F1":0.6803 + } + } +} \ No newline at end of file diff --git a/eval-results/GB/HyenaDNA.json b/eval-results/GB/HyenaDNA.json new file mode 100644 index 0000000000000000000000000000000000000000..e16541883eeac9d04b2bff41acbb2800fbb61d17 --- /dev/null +++ b/eval-results/GB/HyenaDNA.json @@ -0,0 +1,48 @@ +{ + "config":{ + "model":"LongSafari/hyenadna-large-1m-seqlen-hf", + "model_args":"pretrained=LongSafari/hyenadna-large-1m-seqlen-hf,revision=main,dtype=bfloat16", + "num_fewshot":0, + "batch_size":1, + "batch_sizes":[ + + ], + "device":"cpu", + "no_cache":true, + "limit":20, + "bootstrap_iters":100000, + "description_dict":{"Tokenization":"SNT", "# of Params":"47M", "Pretraining Data":"~3.2B Tokens", "Species":"Human", "Nucleic Acid":"DNA"}, + "model_dtype":"bfloat16", + "model_name":"LongSafari/hyenadna-large-1m-seqlen-hf", + "model_sha":"main" + }, + "results":{ + "DEM":{ + "F1":0.8821 + }, + "DOW":{ + "F1":0.9413 + }, + "DRE":{ + "F1":0.7011 + }, + "DME":{ + "F1":0.7644 + }, + "HCE":{ + "F1":0.7038 + }, + "HEE":{ + "F1":0.7958 + }, + "HRE":{ + "F1":0.9633 + }, + "HNP":{ + "F1":0.8599 + }, + "HOR":{ + "F1":0.6703 + } + } +} \ No newline at end of file diff --git a/eval-results/GB/NT-V2-100M.json b/eval-results/GB/NT-V2-100M.json new file mode 100644 index 0000000000000000000000000000000000000000..04e8c8e3026dee318fba5ff3f68d2b32f1cfe7f7 --- /dev/null +++ b/eval-results/GB/NT-V2-100M.json @@ -0,0 +1,48 @@ +{ + "config":{ + "model":"InstaDeepAI/nucleotide-transformer-v2-100m-multi-species", + "model_args":"pretrained=InstaDeepAI/nucleotide-transformer-v2-100m-multi-species,revision=main,dtype=bfloat16", + "num_fewshot":0, + "batch_size":1, + "batch_sizes":[ + + ], + "device":"cpu", + "no_cache":true, + "limit":20, + "bootstrap_iters":100000, + "description_dict":{"Tokenization":"k-mers", "# of Params":"96M", "Pretraining Data":"300B Tokens", "Species":"Human + 850 Species", "Nucleic Acid":"DNA"}, + "model_dtype":"bfloat16", + "model_name":"InstaDeepAI/nucleotide-transformer-v2-100m-multi-species", + "model_sha":"main" + }, + "results":{ + "DEM":{ + "F1":0.9166 + }, + "DOW":{ + "F1":0.9432 + }, + "DRE":{ + "F1":0.7820 + }, + "DME":{ + "F1":0.8172 + }, + "HCE":{ + "F1":0.7198 + }, + "HEE":{ + "F1":0.7985 + }, + "HRE":{ + "F1":0.9330 + }, + "HNP":{ + "F1":0.8530 + }, + "HOR":{ + "F1":0.6853 + } + } +} \ No newline at end of file diff --git a/eval-results/GB/OmniGenome186M.json b/eval-results/GB/OmniGenome186M.json new file mode 100644 index 0000000000000000000000000000000000000000..4f53aeb07e97ab71bbc8f78e16fa87545fb0c43c --- /dev/null +++ b/eval-results/GB/OmniGenome186M.json @@ -0,0 +1,48 @@ +{ + "config":{ + "model":"yangheng/omnigenome-186M", + "model_args":"pretrained=yangheng/omnigenome-186M,revision=main,dtype=bfloat16", + "num_fewshot":0, + "batch_size":1, + "batch_sizes":[ + + ], + "device":"cpu", + "no_cache":true, + "limit":20, + "bootstrap_iters":100000, + "description_dict":{"Tokenization":"SNT", "# of Params":"186M", "Pretraining Data":"54.2B Tokens", "Species":"1124 Plant Species", "Nucleic Acid":"mRNA, CDS, UTR"}, + "model_dtype":"bfloat16", + "model_name":"yangheng/omnigenome-186M", + "model_sha":"main" + }, + "results":{ + "DEM":{ + "F1":0.9416 + }, + "DOW":{ + "F1":0.9349 + }, + "DRE":{ + "F1":0.7717 + }, + "DME":{ + "F1":0.8034 + }, + "HCE":{ + "F1":0.7351 + }, + "HEE":{ + "F1":0.8223 + }, + "HRE":{ + "F1":0.9566 + }, + "HNP":{ + "F1":0.8787 + }, + "HOR":{ + "F1":0.6897 + } + } +} \ No newline at end of file diff --git a/eval-results/GB/SpliceBERT.json b/eval-results/GB/SpliceBERT.json new file mode 100644 index 0000000000000000000000000000000000000000..ea4b1c7c66306ef5379832c7558d16a550a55561 --- /dev/null +++ b/eval-results/GB/SpliceBERT.json @@ -0,0 +1,48 @@ +{ + "config":{ + "model":"multimolecule/splicebert", + "model_args":"pretrained=multimolecule/splicebert,revision=main,dtype=bfloat16", + "num_fewshot":0, + "batch_size":1, + "batch_sizes":[ + + ], + "device":"cpu", + "no_cache":true, + "limit":20, + "bootstrap_iters":100000, + "description_dict":{"Tokenization":"SNT", "# of Params":"19.7M", "Pretraining Data":"65 billion nucleotides", "Species":"Multi-Species", "Nucleic Acid":"mRNA"}, + "model_dtype":"bfloat16", + "model_name":"multimolecule/splicebert", + "model_sha":"main" + }, + "results":{ + "DEM":{ + "F1":0.9472 + }, + "DOW":{ + "F1":0.9642 + }, + "DRE":{ + "F1":0.7229 + }, + "DME":{ + "F1":0.7470 + }, + "HCE":{ + "F1":0.7350 + }, + "HEE":{ + "F1":0.7960 + }, + "HRE":{ + "F1":0.9523 + }, + "HNP":{ + "F1":0.8957 + }, + "HOR":{ + "F1":0.6889 + } + } +} \ No newline at end of file diff --git a/eval-results/GUE/3UTRBERT.json b/eval-results/GUE/3UTRBERT.json new file mode 100644 index 0000000000000000000000000000000000000000..95d23ee36786021be7ec587e5b278036700807fc --- /dev/null +++ b/eval-results/GUE/3UTRBERT.json @@ -0,0 +1,42 @@ +{ + "config":{ + "model":"multimolecule/utrbert-4mer", + "model_args":"pretrained=multimolecule/utrbert-4mer,revision=main,dtype=bfloat16", + "num_fewshot":0, + "batch_size":1, + "batch_sizes":[ + + ], + "device":"cpu", + "no_cache":true, + "limit":20, + "bootstrap_iters":100000, + "description_dict":{"Tokenization":"k-mers", "# of Params":"86M", "Pretraining Data":"20,362 Sequences", "Species":"Multi-Species", "Nucleic Acid":"mRNA 3'UTR"}, + "model_dtype":"bfloat16", + "model_name":"multimolecule/utrbert-4mer", + "model_sha":"main" + }, + "results":{ + "Yeast EMP":{ + "F1":0.7189 + }, + "Mouse TF-M":{ + "F1":0.7146 + }, + "Virus CVC":{ + "F1":0.6871 + }, + "Human TF-H":{ + "F1":0.7485 + }, + "Human PD":{ + "F1":0.8237 + }, + "Human CPD":{ + "F1":0.9051 + }, + "Human SSP":{ + "F1":0.8195 + } + } +} \ No newline at end of file diff --git a/eval-results/GUE/Caduceus.json b/eval-results/GUE/Caduceus.json new file mode 100644 index 0000000000000000000000000000000000000000..5efcd074500e32eb68a3417305640a6e65ad74bb --- /dev/null +++ b/eval-results/GUE/Caduceus.json @@ -0,0 +1,42 @@ +{ + "config":{ + "model":"kuleshov-group/caduceus-ps_seqlen-131k_d_model-256_n_layer-16", + "model_args":"pretrained= kuleshov-group/caduceus-ps_seqlen-131k_d_model-256_n_layer-16,revision=main,dtype=bfloat16", + "num_fewshot":0, + "batch_size":1, + "batch_sizes":[ + + ], + "device":"cpu", + "no_cache":true, + "limit":20, + "bootstrap_iters":100000, + "description_dict":{"Tokenization":"SNT", "# of Params":"7.73M", "Pretraining Data":"35 billion nucleotide base pairs", "Species":"Human", "Nucleic Acid":"DNA"}, + "model_dtype":"bfloat16", + "model_name":"kuleshov-group/caduceus-ps_seqlen-131k_d_model-256_n_layer-16", + "model_sha":"main" + }, + "results":{ + "Yeast EMP":{ + "F1":0.7349 + }, + "Mouse TF-M":{ + "F1":0.7818 + }, + "Virus CVC":{ + "F1":0.4909 + }, + "Human TF-H":{ + "F1":0.7956 + }, + "Human PD":{ + "F1":0.8913 + }, + "Human CPD":{ + "F1":0.8509 + }, + "Human SSP":{ + "F1":0.8182 + } + } +} \ No newline at end of file diff --git a/eval-results/GUE/DNABERT-2-117M.json b/eval-results/GUE/DNABERT-2-117M.json new file mode 100644 index 0000000000000000000000000000000000000000..9ec4062de7f854995b73467107515d49ae9c4fb6 --- /dev/null +++ b/eval-results/GUE/DNABERT-2-117M.json @@ -0,0 +1,42 @@ +{ + "config":{ + "model":"zhihan1996/DNABERT-2-117M", + "model_args":"pretrained=zhihan1996/DNABERT-2-117M,revision=main,dtype=bfloat16", + "num_fewshot":0, + "batch_size":1, + "batch_sizes":[ + + ], + "device":"cpu", + "no_cache":true, + "limit":20, + "bootstrap_iters":100000, + "description_dict":{"Tokenization":"BPE", "# of Params":"117M", "Pretraining Data":"32.49B Tokens", "Species":"Human + 135 Species", "Nucleic Acid":"DNA"}, + "model_dtype":"bfloat16", + "model_name":"zhihan1996/DNABERT-2-117M", + "model_sha":"main" + }, + "results":{ + "Yeast EMP":{ + "F1":0.7585 + }, + "Mouse TF-M":{ + "F1":0.8623 + }, + "Virus CVC":{ + "F1":0.6890 + }, + "Human TF-H":{ + "F1":0.8180 + }, + "Human PD":{ + "F1":0.9017 + }, + "Human CPD":{ + "F1":0.8257 + }, + "Human SSP":{ + "F1":0.8521 + } + } +} \ No newline at end of file diff --git a/eval-results/GUE/HyenaDNA.json b/eval-results/GUE/HyenaDNA.json new file mode 100644 index 0000000000000000000000000000000000000000..2e8f09165601c1cccda8918d51e6785afef7cc75 --- /dev/null +++ b/eval-results/GUE/HyenaDNA.json @@ -0,0 +1,42 @@ +{ + "config":{ + "model":"LongSafari/hyenadna-large-1m-seqlen-hf", + "model_args":"pretrained=LongSafari/hyenadna-large-1m-seqlen-hf,revision=main,dtype=bfloat16", + "num_fewshot":0, + "batch_size":1, + "batch_sizes":[ + + ], + "device":"cpu", + "no_cache":true, + "limit":20, + "bootstrap_iters":100000, + "description_dict":{"Tokenization":"SNT", "# of Params":"47M", "Pretraining Data":"~3.2B Tokens", "Species":"Human", "Nucleic Acid":"DNA"}, + "model_dtype":"bfloat16", + "model_name":"LongSafari/hyenadna-large-1m-seqlen-hf", + "model_sha":"main" + }, + "results":{ + "Yeast EMP":{ + "F1":0.7308 + }, + "Mouse TF-M":{ + "F1":0.7344 + }, + "Virus CVC":{ + "F1":0.6637 + }, + "Human TF-H":{ + "F1":0.7762 + }, + "Human PD":{ + "F1":0.9119 + }, + "Human CPD":{ + "F1":0.8431 + }, + "Human SSP":{ + "F1":0.8334 + } + } +} \ No newline at end of file diff --git a/eval-results/GUE/NT-V2-100M.json b/eval-results/GUE/NT-V2-100M.json new file mode 100644 index 0000000000000000000000000000000000000000..2b3a39d97cf155bf7888c9cdd54c03c6bddfd8b9 --- /dev/null +++ b/eval-results/GUE/NT-V2-100M.json @@ -0,0 +1,42 @@ +{ + "config":{ + "model":"InstaDeepAI/nucleotide-transformer-v2-100m-multi-species", + "model_args":"pretrained=InstaDeepAI/nucleotide-transformer-v2-100m-multi-species,revision=main,dtype=bfloat16", + "num_fewshot":0, + "batch_size":1, + "batch_sizes":[ + + ], + "device":"cpu", + "no_cache":true, + "limit":20, + "bootstrap_iters":100000, + "description_dict":{"Tokenization":"k-mers", "# of Params":"96M", "Pretraining Data":"300B Tokens", "Species":"Human + 850 Species", "Nucleic Acid":"DNA"}, + "model_dtype":"bfloat16", + "model_name":"InstaDeepAI/nucleotide-transformer-v2-100m-multi-species", + "model_sha":"main" + }, + "results":{ + "Yeast EMP":{ + "F1":0.7493 + }, + "Mouse TF-M":{ + "F1":0.7810 + }, + "Virus CVC":{ + "F1":0.5923 + }, + "Human TF-H":{ + "F1":0.7912 + }, + "Human PD":{ + "F1":0.9087 + }, + "Human CPD":{ + "F1":0.8470 + }, + "Human SSP":{ + "F1":0.8413 + } + } +} \ No newline at end of file diff --git a/eval-results/GUE/OmniGenome186M.json b/eval-results/GUE/OmniGenome186M.json new file mode 100644 index 0000000000000000000000000000000000000000..7a8f30ed4659430d7155800e51113eb113cea5ea --- /dev/null +++ b/eval-results/GUE/OmniGenome186M.json @@ -0,0 +1,42 @@ +{ + "config":{ + "model":"yangheng/omnigenome-186M", + "model_args":"pretrained=yangheng/omnigenome-186M,revision=main,dtype=bfloat16", + "num_fewshot":0, + "batch_size":1, + "batch_sizes":[ + + ], + "device":"cpu", + "no_cache":true, + "limit":20, + "bootstrap_iters":100000, + "description_dict":{"Tokenization":"SNT", "# of Params":"186M", "Pretraining Data":"54.2B Tokens", "Species":"1124 Plant Species", "Nucleic Acid":"mRNA, CDS, UTR"}, + "model_dtype":"bfloat16", + "model_name":"yangheng/omnigenome-186M", + "model_sha":"main" + }, + "results":{ + "Yeast EMP":{ + "F1":0.7851 + }, + "Mouse TF-M":{ + "F1":0.8472 + }, + "Virus CVC":{ + "F1":0.7472 + }, + "Human TF-H":{ + "F1":0.8173 + }, + "Human PD":{ + "F1":0.9004 + }, + "Human CPD":{ + "F1":0.8522 + }, + "Human SSP":{ + "F1":0.9039 + } + } +} \ No newline at end of file diff --git a/eval-results/GUE/SpliceBERT.json b/eval-results/GUE/SpliceBERT.json new file mode 100644 index 0000000000000000000000000000000000000000..e68ed40e24cf3d44c7e36b5e0126543abdc237ac --- /dev/null +++ b/eval-results/GUE/SpliceBERT.json @@ -0,0 +1,42 @@ +{ + "config":{ + "model":"multimolecule/splicebert", + "model_args":"pretrained=multimolecule/splicebert,revision=main,dtype=bfloat16", + "num_fewshot":0, + "batch_size":1, + "batch_sizes":[ + + ], + "device":"cpu", + "no_cache":true, + "limit":20, + "bootstrap_iters":100000, + "description_dict":{"Tokenization":"SNT", "# of Params":"19.7M", "Pretraining Data":"65 billion nucleotides", "Species":"Multi-Species", "Nucleic Acid":"mRNA"}, + "model_dtype":"bfloat16", + "model_name":"multimolecule/splicebert", + "model_sha":"main" + }, + "results":{ + "Yeast EMP":{ + "F1":0.7766 + }, + "Mouse TF-M":{ + "F1":0.8497 + }, + "Virus CVC":{ + "F1":0.5624 + }, + "Human TF-H":{ + "F1":0.8277 + }, + "Human PD":{ + "F1":0.9224 + }, + "Human CPD":{ + "F1":0.8396 + }, + "Human SSP":{ + "F1":0.9381 + } + } +} \ No newline at end of file diff --git a/eval-results/PGB/3UTRBERT.json b/eval-results/PGB/3UTRBERT.json new file mode 100644 index 0000000000000000000000000000000000000000..095b1f90d8ed20191ab55b722ed6a4ec48c26e48 --- /dev/null +++ b/eval-results/PGB/3UTRBERT.json @@ -0,0 +1,45 @@ +{ + "config":{ + "model":"multimolecule/utrbert-4mer", + "model_args":"pretrained=multimolecule/utrbert-4mer,revision=main,dtype=bfloat16", + "num_fewshot":0, + "batch_size":1, + "batch_sizes":[ + + ], + "device":"cpu", + "no_cache":true, + "limit":20, + "bootstrap_iters":100000, + "description_dict":{"Tokenization":"k-mers", "# of Params":"86M", "Pretraining Data":"20,362 Sequences", "Species":"Multi-Species", "Nucleic Acid":"mRNA 3'UTR"}, + "model_dtype":"bfloat16", + "model_name":"multimolecule/utrbert-4mer", + "model_sha":"main" + }, +"results":{ + "PolyA":{ + "F1":0.7648 + }, + "LncRNA":{ + "F1":0.7075 + }, + "Chrom Acc":{ + "F1":0.6371 + }, + "Prom Str":{ + "RMSE":1.04 + }, + "Term Str":{ + "RMSE":0.36 + }, + "Splice":{ + "F1":0.9444 + }, + "Gene Exp":{ + "RMSE":14.87 + }, + "Enhancer":{ + "F1":0.7167 + } + } +} \ No newline at end of file diff --git a/eval-results/PGB/Agro-NT.json b/eval-results/PGB/Agro-NT.json new file mode 100644 index 0000000000000000000000000000000000000000..9c1cf5acde27b681f67c9ca8e23b5bd3b9c6e63e --- /dev/null +++ b/eval-results/PGB/Agro-NT.json @@ -0,0 +1,45 @@ +{ + "config":{ + "model":"InstaDeepAI/agro-nucleotide-transformer-1b", + "model_args":"pretrained=InstaDeepAI/agro-nucleotide-transformer-1b,revision=main,dtype=bfloat16", + "num_fewshot":0, + "batch_size":1, + "batch_sizes":[ + + ], + "device":"cpu", + "no_cache":true, + "limit":20, + "bootstrap_iters":100000, + "description_dict":{"Tokenization":"k-mers", "# of Params":"985M", "Pretraining Data":"472.5B Tokens", "Species":"48 Edible Plants", "Nucleic Acid":"DNA"}, + "model_dtype":"bfloat16", + "model_name":"InstaDeepAI/agro-nucleotide-transformer-1b", + "model_sha":"main" + }, +"results":{ + "PolyA":{ + "F1":0.7889 + }, + "LncRNA":{ + "F1":0.6724 + }, + "Chrom Acc":{ + "F1":0.6327 + }, + "Prom Str":{ + "RMSE":0.94 + }, + "Term Str":{ + "RMSE":0.78 + }, + "Splice":{ + "F1":0.8845 + }, + "Gene Exp":{ + "RMSE":15.56 + }, + "Enhancer":{ + "F1":0.6283 + } + } +} \ No newline at end of file diff --git a/eval-results/PGB/CDSBERT.json b/eval-results/PGB/CDSBERT.json new file mode 100644 index 0000000000000000000000000000000000000000..92365f0f58bc175190a3ff95cbc24d2093494f09 --- /dev/null +++ b/eval-results/PGB/CDSBERT.json @@ -0,0 +1,45 @@ +{ + "config":{ + "model":"GleghornLab/cdsBERT", + "model_args":"pretrained=GleghornLab/cdsBERT,revision=main,dtype=bfloat16", + "num_fewshot":0, + "batch_size":1, + "batch_sizes":[ + + ], + "device":"cpu", + "no_cache":true, + "limit":20, + "bootstrap_iters":100000, + "description_dict":{"Tokenization":"SNT", "# of Params":"420M", "Pretraining Data":"4M Sequences", "Species":"4,069 RNA families", "Nucleic Acid":"CDS"}, + "model_dtype":"bfloat16", + "model_name":"GleghornLab/cdsBERT", + "model_sha":"main" + }, + "results":{ + "PolyA":{ + "F1":0.3972 + }, + "LncRNA":{ + "F1":0.3306 + }, + "Chrom Acc":{ + "F1":0.4895 + }, + "Prom Str":{ + "RMSE":2.19 + }, + "Term Str":{ + "RMSE":0.59 + }, + "Splice":{ + "F1":0.5220 + }, + "Gene Exp":{ + "RMSE":14.77 + }, + "Enhancer":{ + "F1":0.3393 + } + } +} \ No newline at end of file diff --git a/eval-results/PGB/Caduceus.json b/eval-results/PGB/Caduceus.json new file mode 100644 index 0000000000000000000000000000000000000000..80cfc4a525127aff7dbc5d0a71224d432319bbc6 --- /dev/null +++ b/eval-results/PGB/Caduceus.json @@ -0,0 +1,45 @@ +{ + "config":{ + "model":"kuleshov-group/caduceus-ps_seqlen-131k_d_model-256_n_layer-16", + "model_args":"pretrained= kuleshov-group/caduceus-ps_seqlen-131k_d_model-256_n_layer-16,revision=main,dtype=bfloat16", + "num_fewshot":0, + "batch_size":1, + "batch_sizes":[ + + ], + "device":"cpu", + "no_cache":true, + "limit":20, + "bootstrap_iters":100000, + "description_dict":{"Tokenization":"SNT", "# of Params":"7.73M", "Pretraining Data":"35 billion nucleotide base pairs", "Species":"Human", "Nucleic Acid":"DNA"}, + "model_dtype":"bfloat16", + "model_name":"kuleshov-group/caduceus-ps_seqlen-131k_d_model-256_n_layer-16", + "model_sha":"main" + }, + "results":{ + "PolyA":{ + "F1":0.7089 + }, + "LncRNA":{ + "F1":0.6840 + }, + "Chrom Acc":{ + "F1":0.6453 + }, + "Prom Str":{ + "RMSE":0.91 + }, + "Term Str":{ + "RMSE":0.26 + }, + "Splice":{ + "F1":0.7951 + }, + "Gene Exp":{ + "RMSE":14.72 + }, + "Enhancer":{ + "F1":0.6083 + } + } +} \ No newline at end of file diff --git a/eval-results/PGB/DNABERT-2-117M.json b/eval-results/PGB/DNABERT-2-117M.json new file mode 100644 index 0000000000000000000000000000000000000000..3906893d2ce05c38ed29efe43ba07f466bdc0e58 --- /dev/null +++ b/eval-results/PGB/DNABERT-2-117M.json @@ -0,0 +1,45 @@ +{ + "config":{ + "model":"zhihan1996/DNABERT-2-117M", + "model_args":"pretrained=zhihan1996/DNABERT-2-117M,revision=main,dtype=bfloat16", + "num_fewshot":0, + "batch_size":1, + "batch_sizes":[ + + ], + "device":"cpu", + "no_cache":true, + "limit":20, + "bootstrap_iters":100000, + "description_dict":{"Tokenization":"BPE", "# of Params":"117M", "Pretraining Data":"32.49B Tokens", "Species":"Human + 135 Species", "Nucleic Acid":"DNA"}, + "model_dtype":"bfloat16", + "model_name":"zhihan1996/DNABERT-2-117M", + "model_sha":"main" + }, + "results":{ + "PolyA":{ + "F1":0.4135 + }, + "LncRNA":{ + "F1":0.7255 + }, + "Chrom Acc":{ + "F1":0.6149 + }, + "Prom Str":{ + "RMSE":0.99 + }, + "Term Str":{ + "RMSE":0.24 + }, + "Splice":{ + "F1":0.4534 + }, + "Gene Exp":{ + "RMSE":14.78 + }, + "Enhancer":{ + "F1":0.3640 + } + } +} \ No newline at end of file diff --git a/eval-results/PGB/HyenaDNA.json b/eval-results/PGB/HyenaDNA.json new file mode 100644 index 0000000000000000000000000000000000000000..af033dd0115d50a3bdebed9eb98038b3850d3c41 --- /dev/null +++ b/eval-results/PGB/HyenaDNA.json @@ -0,0 +1,45 @@ +{ + "config":{ + "model":"LongSafari/hyenadna-large-1m-seqlen-hf", + "model_args":"pretrained=LongSafari/hyenadna-large-1m-seqlen-hf,revision=main,dtype=bfloat16", + "num_fewshot":0, + "batch_size":1, + "batch_sizes":[ + + ], + "device":"cpu", + "no_cache":true, + "limit":20, + "bootstrap_iters":100000, + "description_dict":{"Tokenization":"SNT", "# of Params":"47M", "Pretraining Data":"~3.2B Tokens", "Species":"Human", "Nucleic Acid":"DNA"}, + "model_dtype":"bfloat16", + "model_name":"LongSafari/hyenadna-large-1m-seqlen-hf", + "model_sha":"main" + }, + "results":{ + "PolyA":{ + "F1":0.8311 + }, + "LncRNA":{ + "F1":0.5821 + }, + "Chrom Acc":{ + "F1":0.5220 + }, + "Prom Str":{ + "RMSE":0.88 + }, + "Term Str":{ + "RMSE":0.26 + }, + "Splice":{ + "F1":0.9028 + }, + "Gene Exp":{ + "RMSE":14.76 + }, + "Enhancer":{ + "F1":0.6617 + } + } +} \ No newline at end of file diff --git a/eval-results/PGB/NT-V2-100M.json b/eval-results/PGB/NT-V2-100M.json new file mode 100644 index 0000000000000000000000000000000000000000..9dbcaf073a15ce2d3e1c89b16d7b65d074e97a89 --- /dev/null +++ b/eval-results/PGB/NT-V2-100M.json @@ -0,0 +1,45 @@ +{ + "config":{ + "model":"InstaDeepAI/nucleotide-transformer-v2-100m-multi-species", + "model_args":"pretrained=InstaDeepAI/nucleotide-transformer-v2-100m-multi-species,revision=main,dtype=bfloat16", + "num_fewshot":0, + "batch_size":1, + "batch_sizes":[ + + ], + "device":"cpu", + "no_cache":true, + "limit":20, + "bootstrap_iters":100000, + "description_dict":{"Tokenization":"k-mers", "# of Params":"96M", "Pretraining Data":"300B Tokens", "Species":"Human + 850 Species", "Nucleic Acid":"DNA"}, + "model_dtype":"bfloat16", + "model_name":"InstaDeepAI/nucleotide-transformer-v2-100m-multi-species", + "model_sha":"main" + }, + "results":{ + "PolyA":{ + "F1":0.7126 + }, + "LncRNA":{ + "F1":0.7308 + }, + "Chrom Acc":{ + "F1":0.6571 + }, + "Prom Str":{ + "RMSE":0.81 + }, + "Term Str":{ + "RMSE":0.27 + }, + "Splice":{ + "F1":0.9505 + }, + "Gene Exp":{ + "RMSE":14.69 + }, + "Enhancer":{ + "F1":0.7389 + } + } +} \ No newline at end of file diff --git a/eval-results/PGB/OmniGenome186M.json b/eval-results/PGB/OmniGenome186M.json new file mode 100644 index 0000000000000000000000000000000000000000..06b0c51eead662453dcfb7fe0c41ae7a8297046b --- /dev/null +++ b/eval-results/PGB/OmniGenome186M.json @@ -0,0 +1,45 @@ +{ + "config":{ + "model":"yangheng/omnigenome-186M", + "model_args":"pretrained=yangheng/omnigenome-186M,revision=main,dtype=bfloat16", + "num_fewshot":0, + "batch_size":1, + "batch_sizes":[ + + ], + "device":"cpu", + "no_cache":true, + "limit":20, + "bootstrap_iters":100000, + "description_dict":{"Tokenization":"SNT", "# of Params":"186M", "Pretraining Data":"54.2B Tokens", "Species":"1124 Plant Species", "Nucleic Acid":"mRNA, CDS, UTR"}, + "model_dtype":"bfloat16", + "model_name":"yangheng/omnigenome-186M", + "model_sha":"main" + }, + "results":{ + "PolyA":{ + "F1":0.8755 + }, + "LncRNA":{ + "F1":0.7796 + }, + "Chrom Acc":{ + "F1":0.6769 + }, + "Prom Str":{ + "RMSE":0.59 + }, + "Term Str":{ + "RMSE":0.18 + }, + "Splice":{ + "F1":0.9841 + }, + "Gene Exp":{ + "RMSE":14.71 + }, + "Enhancer":{ + "F1":0.7977 + } + } +} \ No newline at end of file diff --git a/eval-results/PGB/RNA-BERT.json b/eval-results/PGB/RNA-BERT.json new file mode 100644 index 0000000000000000000000000000000000000000..dc87b9183fd0d44c57fc5f39596f9670be1d7b88 --- /dev/null +++ b/eval-results/PGB/RNA-BERT.json @@ -0,0 +1,46 @@ +{ + "config":{ + "model":"multimolecule/rnabert", + "model_args":"pretrained=multimolecule/rnabert,revision=main,dtype=bfloat16", + "num_fewshot":0, + "batch_size":1, + "batch_sizes":[ + + ], + "device":"cpu", + "no_cache":true, + "limit":20, + "bootstrap_iters":100000, + "description_dict":{"Tokenization":"SNT", "# of Params":"0.48M", "Pretraining Data":"76,237 human ncRNA sequences", "Species":"Human", "Nucleic Acid":"ncRNA"}, + "model_dtype":"bfloat16", + "model_name":"multimolecule/rnabert", + "model_sha":"main" + }, + "results":{ + "PolyA":{ + "F1":0.7854 + }, + "LncRNA":{ + "F1":0.6199 + }, + "Chrom Acc":{ + "F1":0.4894 + }, + "Prom Str":{ + "RMSE":1.81 + }, + "Term Str":{ + "RMSE":0.38 + }, + "Splice":{ + "F1":0.9445 + }, + "Gene Exp":{ + "RMSE":14.89 + }, + "Enhancer":{ + "F1":0.5761 + } + + } +} \ No newline at end of file diff --git a/eval-results/PGB/RNA-FM.json b/eval-results/PGB/RNA-FM.json new file mode 100644 index 0000000000000000000000000000000000000000..73cc0c1f5125e649358e73058bb3310733f164d3 --- /dev/null +++ b/eval-results/PGB/RNA-FM.json @@ -0,0 +1,45 @@ +{ + "config":{ + "model":"multimolecule/rnafm", + "model_args":"pretrained=multimolecule/rnafm,revision=main,dtype=bfloat16", + "num_fewshot":0, + "batch_size":1, + "batch_sizes":[ + + ], + "device":"cpu", + "no_cache":true, + "limit":20, + "bootstrap_iters":100000, + "description_dict":{"Tokenization":"SNT", "# of Params":"99.52M", "Pretraining Data":"23.7 million non-redundant RNA sequences", "Species":"Multi-Species", "Nucleic Acid":"ncRNA"}, + "model_dtype":"bfloat16", + "model_name":"multimolecule/rnafm", + "model_sha":"main" + }, + "results":{ + "PolyA":{ + "F1":0.8494 + }, + "LncRNA":{ + "F1":0.6875 + }, + "Chrom Acc":{ + "F1":0.5492 + }, + "Prom Str":{ + "RMSE":0.95 + }, + "Term Str":{ + "RMSE":0.27 + }, + "Splice":{ + "F1":0.9595 + }, + "Gene Exp":{ + "RMSE":14.83 + }, + "Enhancer":{ + "F1":0.5714 + } + } +} \ No newline at end of file diff --git a/eval-results/PGB/RNA-MSM.json b/eval-results/PGB/RNA-MSM.json new file mode 100644 index 0000000000000000000000000000000000000000..282ad3a11ed1f200ccd93c78fcd961d94c4364de --- /dev/null +++ b/eval-results/PGB/RNA-MSM.json @@ -0,0 +1,45 @@ +{ + "config":{ + "model":"multimolecule/rnamsm", + "model_args":"pretrained=multimolecule/rnamsm,revision=main,dtype=bfloat16", + "num_fewshot":0, + "batch_size":1, + "batch_sizes":[ + + ], + "device":"cpu", + "no_cache":true, + "limit":20, + "bootstrap_iters":100000, + "description_dict":{"Tokenization":"SNT", "# of Params":"96.5M", "Pretraining Data":"3,932 RNA families", "Species":"Multi-Species", "Nucleic Acid":"RNA"}, + "model_dtype":"bfloat16", + "model_name":"multimolecule/rnamsm", + "model_sha":"main" + }, + "results":{ + "PolyA":{ + "F1":0.8425 + }, + "LncRNA":{ + "F1":0.6749 + }, + "Chrom Acc":{ + "F1":0.5352 + }, + "Prom Str":{ + "RMSE":1.28 + }, + "Term Str":{ + "RMSE":0.28 + }, + "Splice":{ + "F1":0.9549 + }, + "Gene Exp":{ + "RMSE":14.87 + }, + "Enhancer":{ + "F1":0.6145 + } + } +} \ No newline at end of file diff --git a/eval-results/PGB/SpliceBERT.json b/eval-results/PGB/SpliceBERT.json new file mode 100644 index 0000000000000000000000000000000000000000..f52106504f5a96e28e24a29e9ef7019115a7f8ab --- /dev/null +++ b/eval-results/PGB/SpliceBERT.json @@ -0,0 +1,45 @@ +{ + "config":{ + "model":"multimolecule/splicebert", + "model_args":"pretrained=multimolecule/splicebert,revision=main,dtype=bfloat16", + "num_fewshot":0, + "batch_size":1, + "batch_sizes":[ + + ], + "device":"cpu", + "no_cache":true, + "limit":20, + "bootstrap_iters":100000, + "description_dict":{"Tokenization":"SNT", "# of Params":"19.7M", "Pretraining Data":"65 billion nucleotides", "Species":"Multi-Species", "Nucleic Acid":"mRNA"}, + "model_dtype":"bfloat16", + "model_name":"multimolecule/splicebert", + "model_sha":"main" + }, + "results":{ + "PolyA":{ + "F1":0.6523 + }, + "LncRNA":{ + "F1":0.7188 + }, + "Chrom Acc":{ + "F1":0.6362 + }, + "Prom Str":{ + "RMSE":0.75 + }, + "Term Str":{ + "RMSE":0.22 + }, + "Splice":{ + "F1":0.9645 + }, + "Gene Exp":{ + "RMSE":14.70 + }, + "Enhancer":{ + "F1":0.6971 + } + } +} \ No newline at end of file diff --git a/eval-results/RGB/yangheng/3UTRBERT.json b/eval-results/RGB/yangheng/3UTRBERT.json new file mode 100644 index 0000000000000000000000000000000000000000..1adec75827e33722413afddd29d504c61349c815 --- /dev/null +++ b/eval-results/RGB/yangheng/3UTRBERT.json @@ -0,0 +1,39 @@ +{ + "config":{ + "model":"multimolecule/utrbert-4mer", + "model_args":"pretrained=multimolecule/utrbert-4mer,revision=main,dtype=bfloat16", + "num_fewshot":0, + "batch_size":1, + "batch_sizes":[ + + ], + "device":"cpu", + "no_cache":true, + "limit":20, + "bootstrap_iters":100000, + "description_dict":{"Tokenization":"k-mers", "# of Params":"86M", "Pretraining Data":"20,362 Sequences", "Species":"Multi-Species", "Nucleic Acid":"mRNA 3'UTR"}, + "model_dtype":"bfloat16", + "model_name":"multimolecule/utrbert-4mer", + "model_sha":"main" + }, + "results":{ + "mRNA":{ + "RMSE":0.7772 + }, + "SNMD":{ + "AUC":0.5002 + }, + "SNMR":{ + "F1":0.2401 + }, + "ArchiveII":{ + "F1":0.7898 + }, + "bpRNA":{ + "F1":0.5693 + }, + "RNAStralign":{ + "F1":0.9203 + } + } +} \ No newline at end of file diff --git a/eval-results/RGB/yangheng/Agro-NT.json b/eval-results/RGB/yangheng/Agro-NT.json new file mode 100644 index 0000000000000000000000000000000000000000..846f30ec63c16f794989b0db1a154c59e1f3697c --- /dev/null +++ b/eval-results/RGB/yangheng/Agro-NT.json @@ -0,0 +1,39 @@ +{ + "config":{ + "model":"InstaDeepAI/agro-nucleotide-transformer-1b", + "model_args":"pretrained=InstaDeepAI/agro-nucleotide-transformer-1b,revision=main,dtype=bfloat16", + "num_fewshot":0, + "batch_size":1, + "batch_sizes":[ + + ], + "device":"cpu", + "no_cache":true, + "limit":20, + "bootstrap_iters":100000, + "description_dict":{"Tokenization":"k-mers", "# of Params":"985M", "Pretraining Data":"472.5B Tokens", "Species":"48 Edible Plants", "Nucleic Acid":"DNA"}, + "model_dtype":"bfloat16", + "model_name":"InstaDeepAI/agro-nucleotide-transformer-1b", + "model_sha":"main" + }, + "results":{ + "mRNA":{ + "RMSE":0.7830 + }, + "SNMD":{ + "AUC":0.4999 + }, + "SNMR":{ + "F1":0.2638 + }, + "ArchiveII":{ + "F1":0.7013 + }, + "bpRNA":{ + "F1":0.4871 + }, + "RNAStralign":{ + "F1":0.7521 + } + } +} \ No newline at end of file diff --git a/eval-results/RGB/yangheng/CDSBERT.json b/eval-results/RGB/yangheng/CDSBERT.json new file mode 100644 index 0000000000000000000000000000000000000000..3c4b2d0bacaf8c5ada6c773d2f2fbf5ed1b05fa4 --- /dev/null +++ b/eval-results/RGB/yangheng/CDSBERT.json @@ -0,0 +1,39 @@ +{ + "config":{ + "model":"GleghornLab/cdsBERT", + "model_args":"pretrained=GleghornLab/cdsBERT,revision=main,dtype=bfloat16", + "num_fewshot":0, + "batch_size":1, + "batch_sizes":[ + + ], + "device":"cpu", + "no_cache":true, + "limit":20, + "bootstrap_iters":100000, + "description_dict":{"Tokenization":"SNT", "# of Params":"420M", "Pretraining Data":"4M Sequences", "Species":"4,069 RNA families", "Nucleic Acid":"CDS"}, + "model_dtype":"bfloat16", + "model_name":"GleghornLab/cdsBERT", + "model_sha":"main" + }, + "results":{ + "mRNA":{ + "RMSE":0.7468 + }, + "SNMD":{ + "AUC":0.5503 + }, + "SNMR":{ + "F1":0.3616 + }, + "ArchiveII":{ + "F1":0.8934 + }, + "bpRNA":{ + "F1":0.7001 + }, + "RNAStralign":{ + "F1":0.9715 + } + } +} \ No newline at end of file diff --git a/eval-results/RGB/yangheng/DNABERT-2-117M.json b/eval-results/RGB/yangheng/DNABERT-2-117M.json new file mode 100644 index 0000000000000000000000000000000000000000..d5420bb518b4cc9bbfaddbc533c30c0d9f49bb06 --- /dev/null +++ b/eval-results/RGB/yangheng/DNABERT-2-117M.json @@ -0,0 +1,39 @@ +{ + "config":{ + "model":"zhihan1996/DNABERT-2-117M", + "model_args":"pretrained=zhihan1996/DNABERT-2-117M,revision=main,dtype=bfloat16", + "num_fewshot":0, + "batch_size":1, + "batch_sizes":[ + + ], + "device":"cpu", + "no_cache":true, + "limit":20, + "bootstrap_iters":100000, + "description_dict":{"Tokenization":"BPE", "# of Params":"117M", "Pretraining Data":"32.49B Tokens", "Species":"Human + 135 Species", "Nucleic Acid":"DNA"}, + "model_dtype":"bfloat16", + "model_name":"zhihan1996/DNABERT-2-117M", + "model_sha":"main" + }, + "results":{ + "mRNA":{ + "RMSE":0.8158 + }, + "SNMD":{ + "AUC":0.4994 + }, + "SNMR":{ + "F1":0.1586 + }, + "ArchiveII":{ + "F1":0.5982 + }, + "bpRNA":{ + "F1":0.4340 + }, + "RNAStralign":{ + "F1":0.6549 + } + } +} \ No newline at end of file diff --git a/eval-results/RGB/yangheng/HyenaDNA.json b/eval-results/RGB/yangheng/HyenaDNA.json new file mode 100644 index 0000000000000000000000000000000000000000..64a6a4fc545031813e11311068ee7011518c6410 --- /dev/null +++ b/eval-results/RGB/yangheng/HyenaDNA.json @@ -0,0 +1,39 @@ +{ + "config":{ + "model":"LongSafari/hyenadna-large-1m-seqlen-hf", + "model_args":"pretrained=LongSafari/hyenadna-large-1m-seqlen-hf,revision=main,dtype=bfloat16", + "num_fewshot":0, + "batch_size":1, + "batch_sizes":[ + + ], + "device":"cpu", + "no_cache":true, + "limit":20, + "bootstrap_iters":100000, + "description_dict":{"Tokenization":"SNT", "# of Params":"47M", "Pretraining Data":"~3.2B Tokens", "Species":"Human", "Nucleic Acid":"DNA"}, + "model_dtype":"bfloat16", + "model_name":"LongSafari/hyenadna-large-1m-seqlen-hf", + "model_sha":"main" + }, + "results":{ + "mRNA":{ + "RMSE":0.8056 + }, + "SNMD":{ + "AUC":0.5332 + }, + "SNMR":{ + "F1":0.3980 + }, + "ArchiveII":{ + "F1":0.8423 + }, + "bpRNA":{ + "F1":0.5662 + }, + "RNAStralign":{ + "F1":0.9542 + } + } +} \ No newline at end of file diff --git a/eval-results/RGB/yangheng/NT-V2-100M.json b/eval-results/RGB/yangheng/NT-V2-100M.json new file mode 100644 index 0000000000000000000000000000000000000000..f22a639787974b5149a8cd30360c249cf9409922 --- /dev/null +++ b/eval-results/RGB/yangheng/NT-V2-100M.json @@ -0,0 +1,39 @@ +{ + "config":{ + "model":"InstaDeepAI/nucleotide-transformer-v2-100m-multi-species", + "model_args":"pretrained=InstaDeepAI/nucleotide-transformer-v2-100m-multi-species,revision=main,dtype=bfloat16", + "num_fewshot":0, + "batch_size":1, + "batch_sizes":[ + + ], + "device":"cpu", + "no_cache":true, + "limit":20, + "bootstrap_iters":100000, + "description_dict":{"Tokenization":"k-mers", "# of Params":"96M", "Pretraining Data":"300B Tokens", "Species":"Human + 850 Species", "Nucleic Acid":"DNA"}, + "model_dtype":"bfloat16", + "model_name":"InstaDeepAI/nucleotide-transformer-v2-100m-multi-species", + "model_sha":"main" + }, + "results":{ + "mRNA":{ + "RMSE":0.7826 + }, + "SNMD":{ + "AUC":0.5049 + }, + "SNMR":{ + "F1":0.2601 + }, + "ArchiveII":{ + "F1":0.7990 + }, + "bpRNA":{ + "F1":0.5660 + }, + "RNAStralign":{ + "F1":0.9084 + } + } +} \ No newline at end of file diff --git a/eval-results/RGB/yangheng/OmniGenome186M.json b/eval-results/RGB/yangheng/OmniGenome186M.json new file mode 100644 index 0000000000000000000000000000000000000000..ec7663dc48867a7bc15c972ae4c11fba5fdb5e37 --- /dev/null +++ b/eval-results/RGB/yangheng/OmniGenome186M.json @@ -0,0 +1,39 @@ +{ + "config":{ + "model":"yangheng/omnigenome-186M", + "model_args":"pretrained=yangheng/omnigenome-186M,revision=main,dtype=bfloat16", + "num_fewshot":0, + "batch_size":1, + "batch_sizes":[ + + ], + "device":"cpu", + "no_cache":true, + "limit":20, + "bootstrap_iters":100000, + "description_dict":{"Tokenization":"SNT", "# of Params":"186M", "Pretraining Data":"54.2B Tokens", "Species":"1124 Plant Species", "Nucleic Acid":"mRNA, CDS, UTR"}, + "model_dtype":"bfloat16", + "model_name":"yangheng/omnigenome-186M", + "model_sha":"main" + }, + "results":{ + "mRNA":{ + "RMSE":0.7164 + }, + "SNMD":{ + "AUC":0.6381 + }, + "SNMR":{ + "F1":0.4980 + }, + "ArchiveII":{ + "F1":0.9520 + }, + "bpRNA":{ + "F1":0.8248 + }, + "RNAStralign":{ + "F1":0.9912 + } + } +} \ No newline at end of file diff --git a/eval-results/RGB/yangheng/SpliceBERT.json b/eval-results/RGB/yangheng/SpliceBERT.json new file mode 100644 index 0000000000000000000000000000000000000000..8ce43c5282635c6aabfd4a5e3b31435f198451af --- /dev/null +++ b/eval-results/RGB/yangheng/SpliceBERT.json @@ -0,0 +1,39 @@ +{ + "config":{ + "model":"multimolecule/splicebert", + "model_args":"pretrained=multimolecule/splicebert,revision=main,dtype=bfloat16", + "num_fewshot":0, + "batch_size":1, + "batch_sizes":[ + + ], + "device":"cpu", + "no_cache":true, + "limit":20, + "bootstrap_iters":100000, + "description_dict":{"Tokenization":"SNT", "# of Params":"19.7M", "Pretraining Data":"65 billion nucleotides", "Species":"Multi-Species", "Nucleic Acid":"mRNA"}, + "model_dtype":"bfloat16", + "model_name":"multimolecule/splicebert", + "model_sha":"main" + }, + "results":{ + "mRNA":{ + "RMSE":0.7340 + }, + "SNMD":{ + "AUC":0.5811 + }, + "SNMR":{ + "F1":0.4644 + }, + "ArchiveII":{ + "F1":0.8905 + }, + "bpRNA":{ + "F1":0.6910 + }, + "RNAStralign":{ + "F1":0.9697 + } + } +} \ No newline at end of file diff --git a/eval-results/RGB/yangheng/results_OmniGenome-52M.json b/eval-results/RGB/yangheng/results_OmniGenome-52M.json new file mode 100644 index 0000000000000000000000000000000000000000..c786546cc3bf47cc0eb0b208341a403f97563f90 --- /dev/null +++ b/eval-results/RGB/yangheng/results_OmniGenome-52M.json @@ -0,0 +1,39 @@ +{ + "config":{ + "model":"yangheng/omnigenome-52M", + "model_args":"pretrained=yangheng/omnigenome-52M,revision=main,dtype=bfloat16", + "num_fewshot":0, + "batch_size":1, + "batch_sizes":[ + + ], + "device":"cpu", + "no_cache":true, + "limit":20, + "bootstrap_iters":100000, + "description_dict":{"Tokenization":"SNT", "# of Params":"52M", "Pretraining Data":"54.2B Tokens", "Species":"1124 Plant Species", "Nucleic Acid":"mRNA, CDS, UTR"}, + "model_dtype":"bfloat16", + "model_name":"yangheng/omnigenome-52M", + "model_sha":"main" + }, + "results":{ + "mRNA":{ + "RMSE":0.7191 + }, + "SNMD":{ + "AUC":0.6244 + }, + "SNMR":{ + "F1":0.4891 + }, + "ArchiveII":{ + "F1":0.9498 + }, + "bpRNA":{ + "F1":0.8234 + }, + "RNAStralign":{ + "F1":0.9901 + } + } +} \ No newline at end of file diff --git a/src/about.py b/src/about.py index 348baefe458b126cc1ca329799718eb5cd21984d..ac3f0503e17d506b729aee323e479d173b9c2c8e 100644 --- a/src/about.py +++ b/src/about.py @@ -111,10 +111,10 @@ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results" CITATION_BUTTON_TEXT = r""" @article{Yang2024, author = {Yang, Heng and Li, Ke}, - title = {Foundation Models Work}, + title = {OmniGenome: Aligning RNA Sequences with Secondary Structures in Genomic Foundation Models}, journal = {arXiv}, year = {2024}, - note = {arXiv preprint arXiv:XXXX.XXXXX} - url = {https://arxiv.org/abs/XXXX.XXXXX} + note = {arXiv preprint arXiv:2407.11242} + url = {https://arxiv.org/abs/2407.11242} } """