# (page-scrape artifacts, not code — kept as comments so the module parses)
# Spaces:
# Running
# Running
""" | |
File: CodonUtils.py | |
--------------------- | |
Includes constants and helper functions used by other Python scripts. | |
""" | |
import itertools
import json
import os
import pickle
import re
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Any, Dict, Iterator, List, Optional, Tuple

import pandas as pd
import requests
import torch
# List of the 20 standard amino acids (one-letter codes, alphabetical order)
AMINO_ACIDS: List[str] = [
    "A",  # Alanine
    "C",  # Cysteine
    "D",  # Aspartic acid
    "E",  # Glutamic acid
    "F",  # Phenylalanine
    "G",  # Glycine
    "H",  # Histidine
    "I",  # Isoleucine
    "K",  # Lysine
    "L",  # Leucine
    "M",  # Methionine
    "N",  # Asparagine
    "P",  # Proline
    "Q",  # Glutamine
    "R",  # Arginine
    "S",  # Serine
    "T",  # Threonine
    "V",  # Valine
    "W",  # Tryptophan
    "Y",  # Tyrosine
]
STOP_SYMBOLS: List[str] = ["_", "*"]  # Stop codon symbols

# Dictionary mapping ambiguous amino acids to standard amino acids
AMBIGUOUS_AMINOACID_MAP: Dict[str, List[str]] = {
    "B": ["N", "D"],  # Asparagine (N) or Aspartic acid (D)
    "Z": ["Q", "E"],  # Glutamine (Q) or Glutamic acid (E)
    "X": ["A"],  # Any amino acid (typically replaced with Alanine)
    "J": ["L", "I"],  # Leucine (L) or Isoleucine (I)
    "U": ["C"],  # Selenocysteine (typically replaced with Cysteine)
    "O": ["K"],  # Pyrrolysine (typically replaced with Lysine)
}

# List of all possible start and stop codons
START_CODONS: List[str] = ["ATG", "TTG", "CTG", "GTG"]
STOP_CODONS: List[str] = ["TAA", "TAG", "TGA"]
# Token-to-index mapping for the model vocabulary.
# Tokens 0-4 are special tokens, 5-25 are "amino-acid with unknown codon"
# tokens ("<aa>_unk"), and 26-89 are the 64 codon tokens ("<aa>_<codon>",
# where "_" as the amino-acid letter denotes a stop).
TOKEN2INDEX: Dict[str, int] = {
    "[UNK]": 0,
    "[CLS]": 1,
    "[SEP]": 2,
    "[PAD]": 3,
    "[MASK]": 4,
    "a_unk": 5,
    "c_unk": 6,
    "d_unk": 7,
    "e_unk": 8,
    "f_unk": 9,
    "g_unk": 10,
    "h_unk": 11,
    "i_unk": 12,
    "k_unk": 13,
    "l_unk": 14,
    "m_unk": 15,
    "n_unk": 16,
    "p_unk": 17,
    "q_unk": 18,
    "r_unk": 19,
    "s_unk": 20,
    "t_unk": 21,
    "v_unk": 22,
    "w_unk": 23,
    "y_unk": 24,
    "__unk": 25,
    "k_aaa": 26,
    "n_aac": 27,
    "k_aag": 28,
    "n_aat": 29,
    "t_aca": 30,
    "t_acc": 31,
    "t_acg": 32,
    "t_act": 33,
    "r_aga": 34,
    "s_agc": 35,
    "r_agg": 36,
    "s_agt": 37,
    "i_ata": 38,
    "i_atc": 39,
    "m_atg": 40,
    "i_att": 41,
    "q_caa": 42,
    "h_cac": 43,
    "q_cag": 44,
    "h_cat": 45,
    "p_cca": 46,
    "p_ccc": 47,
    "p_ccg": 48,
    "p_cct": 49,
    "r_cga": 50,
    "r_cgc": 51,
    "r_cgg": 52,
    "r_cgt": 53,
    "l_cta": 54,
    "l_ctc": 55,
    "l_ctg": 56,
    "l_ctt": 57,
    "e_gaa": 58,
    "d_gac": 59,
    "e_gag": 60,
    "d_gat": 61,
    "a_gca": 62,
    "a_gcc": 63,
    "a_gcg": 64,
    "a_gct": 65,
    "g_gga": 66,
    "g_ggc": 67,
    "g_ggg": 68,
    "g_ggt": 69,
    "v_gta": 70,
    "v_gtc": 71,
    "v_gtg": 72,
    "v_gtt": 73,
    "__taa": 74,
    "y_tac": 75,
    "__tag": 76,
    "y_tat": 77,
    "s_tca": 78,
    "s_tcc": 79,
    "s_tcg": 80,
    "s_tct": 81,
    "__tga": 82,
    "c_tgc": 83,
    "w_tgg": 84,
    "c_tgt": 85,
    "l_tta": 86,
    "f_ttc": 87,
    "l_ttg": 88,
    "f_ttt": 89,
}
# Index-to-token mapping, reverse of TOKEN2INDEX
INDEX2TOKEN: Dict[int, str] = {i: c for c, i in TOKEN2INDEX.items()}

# Dictionary mapping each codon (lowercase, as it appears in the token) to
# its GC content (number of G/C bases, 0-3).
# NOTE(review): stop-codon tokens like "__tag" split into ['', '', 'tag'],
# so `token.split("_")[1]` is "" and stop codons are EXCLUDED here and from
# GC_COUNTS_PER_TOKEN below — confirm this exclusion is intended.
CODON_GC_CONTENT: Dict[str, int] = {
    codon: codon.upper().count("G") + codon.upper().count("C")
    for codon in (token.split("_")[1] for token in TOKEN2INDEX if "_" in token)
    if len(codon) == 3
}

# Tensor with GC counts for each token in the vocabulary (0.0 for special,
# "unk", and stop-codon tokens — see NOTE above).
GC_COUNTS_PER_TOKEN = torch.zeros(len(TOKEN2INDEX))
for token, index in TOKEN2INDEX.items():
    if "_" in token:
        codon = token.split("_")[1]
        if len(codon) == 3:
            # Reuse the precomputed per-codon GC count for consistency.
            GC_COUNTS_PER_TOKEN[index] = CODON_GC_CONTENT[codon]

# Vocabulary indices of tokens whose codon part contains a G (resp. C).
# Uses split("_")[-1], so stop-codon tokens ("__tag", "__tga") ARE included.
G_indices: List[int] = [
    idx for token, idx in TOKEN2INDEX.items() if "g" in token.split("_")[-1]
]
C_indices: List[int] = [
    idx for token, idx in TOKEN2INDEX.items() if "c" in token.split("_")[-1]
]
# Dictionary mapping each amino acid and stop symbol to the sorted vocabulary
# indices of codon tokens that translate to it ("<aa>_unk" tokens excluded).
# "*" has no matching tokens (stop tokens start with "_"), so it maps to [].
AMINO_ACID_TO_INDEX: Dict[str, List[int]] = {
    aa: sorted(
        [i for t, i in TOKEN2INDEX.items() if t[0].upper() == aa and t[-3:] != "unk"]
    )
    for aa in (AMINO_ACIDS + STOP_SYMBOLS)
}

# Dictionary mapping each amino acid to its min/max GC content across all of
# its codons.
# NOTE(review): stop symbols never get an entry — their tokens ("__taa", ...)
# split to an empty codon via split("_")[1], and "*" has no tokens at all.
# Confirm downstream code does not expect "_" or "*" keys here.
AA_MIN_GC: Dict[str, int] = {}
AA_MAX_GC: Dict[str, int] = {}
for aa, token_indices in AMINO_ACID_TO_INDEX.items():
    if token_indices:  # Skip amino acids with no codon tokens
        gc_counts = []
        for token_idx in token_indices:
            token = INDEX2TOKEN[token_idx]
            if "_" in token and len(token.split("_")[1]) == 3:
                codon = token.split("_")[1]
                if codon in CODON_GC_CONTENT:
                    gc_counts.append(CODON_GC_CONTENT[codon])
        if gc_counts:
            AA_MIN_GC[aa] = min(gc_counts)
            AA_MAX_GC[aa] = max(gc_counts)
# Mask token mapping: each codon token index (26-89) maps to the index of its
# amino acid's "<aa>_unk" token (5-25); special and "unk" tokens (0-25) map
# to themselves. Used to hide codon identity while keeping the amino acid.
TOKEN2MASK: Dict[int, int] = {
    0: 0,
    1: 1,
    2: 2,
    3: 3,
    4: 4,
    5: 5,
    6: 6,
    7: 7,
    8: 8,
    9: 9,
    10: 10,
    11: 11,
    12: 12,
    13: 13,
    14: 14,
    15: 15,
    16: 16,
    17: 17,
    18: 18,
    19: 19,
    20: 20,
    21: 21,
    22: 22,
    23: 23,
    24: 24,
    25: 25,
    26: 13,
    27: 16,
    28: 13,
    29: 16,
    30: 21,
    31: 21,
    32: 21,
    33: 21,
    34: 19,
    35: 20,
    36: 19,
    37: 20,
    38: 12,
    39: 12,
    40: 15,
    41: 12,
    42: 18,
    43: 11,
    44: 18,
    45: 11,
    46: 17,
    47: 17,
    48: 17,
    49: 17,
    50: 19,
    51: 19,
    52: 19,
    53: 19,
    54: 14,
    55: 14,
    56: 14,
    57: 14,
    58: 8,
    59: 7,
    60: 8,
    61: 7,
    62: 5,
    63: 5,
    64: 5,
    65: 5,
    66: 10,
    67: 10,
    68: 10,
    69: 10,
    70: 22,
    71: 22,
    72: 22,
    73: 22,
    74: 25,
    75: 24,
    76: 25,
    77: 24,
    78: 20,
    79: 20,
    80: 20,
    81: 20,
    82: 25,
    83: 6,
    84: 23,
    85: 6,
    86: 14,
    87: 9,
    88: 14,
    89: 9,
}
# List of organisms used for fine-tuning
FINE_TUNE_ORGANISMS: List[str] = [
    "Arabidopsis thaliana",
    "Bacillus subtilis",
    "Caenorhabditis elegans",
    "Chlamydomonas reinhardtii",
    "Chlamydomonas reinhardtii chloroplast",
    "Danio rerio",
    "Drosophila melanogaster",
    "Homo sapiens",
    "Mus musculus",
    "Nicotiana tabacum",
    "Nicotiana tabacum chloroplast",
    "Pseudomonas putida",
    "Saccharomyces cerevisiae",
    "Escherichia coli O157-H7 str. Sakai",
    "Escherichia coli general",
    "Escherichia coli str. K-12 substr. MG1655",
    "Thermococcus barophilus MPT",
]

# List of organisms most commonly used for codon optimization
COMMON_ORGANISMS: List[str] = [
    "Arabidopsis thaliana",
    "Bacillus subtilis",
    "Caenorhabditis elegans",
    "Chlamydomonas reinhardtii",
    "Danio rerio",
    "Drosophila melanogaster",
    "Homo sapiens",
    "Mus musculus",
    "Nicotiana tabacum",
    "Pseudomonas putida",
    "Saccharomyces cerevisiae",
    "Escherichia coli general",
]
# Dictionary mapping each organism name to its organism id (0-163)
ORGANISM2ID: Dict[str, int] = {
    "Arabidopsis thaliana": 0,
    "Atlantibacter hermannii": 1,
    "Bacillus subtilis": 2,
    "Brenneria goodwinii": 3,
    "Buchnera aphidicola (Schizaphis graminum)": 4,
    "Caenorhabditis elegans": 5,
    "Candidatus Erwinia haradaeae": 6,
    "Candidatus Hamiltonella defensa 5AT (Acyrthosiphon pisum)": 7,
    "Chlamydomonas reinhardtii": 8,
    "Chlamydomonas reinhardtii chloroplast": 9,
    "Citrobacter amalonaticus": 10,
    "Citrobacter braakii": 11,
    "Citrobacter cronae": 12,
    "Citrobacter europaeus": 13,
    "Citrobacter farmeri": 14,
    "Citrobacter freundii": 15,
    "Citrobacter koseri ATCC BAA-895": 16,
    "Citrobacter portucalensis": 17,
    "Citrobacter werkmanii": 18,
    "Citrobacter youngae": 19,
    "Cronobacter dublinensis subsp. dublinensis LMG 23823": 20,
    "Cronobacter malonaticus LMG 23826": 21,
    "Cronobacter sakazakii": 22,
    "Cronobacter turicensis": 23,
    "Danio rerio": 24,
    "Dickeya dadantii 3937": 25,
    "Dickeya dianthicola": 26,
    "Dickeya fangzhongdai": 27,
    "Dickeya solani": 28,
    "Dickeya zeae": 29,
    "Drosophila melanogaster": 30,
    "Edwardsiella anguillarum ET080813": 31,
    "Edwardsiella ictaluri": 32,
    "Edwardsiella piscicida": 33,
    "Edwardsiella tarda": 34,
    "Enterobacter asburiae": 35,
    "Enterobacter bugandensis": 36,
    "Enterobacter cancerogenus": 37,
    "Enterobacter chengduensis": 38,
    "Enterobacter cloacae": 39,
    "Enterobacter hormaechei": 40,
    "Enterobacter kobei": 41,
    "Enterobacter ludwigii": 42,
    "Enterobacter mori": 43,
    "Enterobacter quasiroggenkampii": 44,
    "Enterobacter roggenkampii": 45,
    "Enterobacter sichuanensis": 46,
    "Erwinia amylovora CFBP1430": 47,
    "Erwinia persicina": 48,
    "Escherichia albertii": 49,
    "Escherichia coli O157-H7 str. Sakai": 50,
    "Escherichia coli general": 51,
    "Escherichia coli str. K-12 substr. MG1655": 52,
    "Escherichia fergusonii": 53,
    "Escherichia marmotae": 54,
    "Escherichia ruysiae": 55,
    "Ewingella americana": 56,
    "Hafnia alvei": 57,
    "Hafnia paralvei": 58,
    "Homo sapiens": 59,
    "Kalamiella piersonii": 60,
    "Klebsiella aerogenes": 61,
    "Klebsiella grimontii": 62,
    "Klebsiella michiganensis": 63,
    "Klebsiella oxytoca": 64,
    "Klebsiella pasteurii": 65,
    "Klebsiella pneumoniae subsp. pneumoniae HS11286": 66,
    "Klebsiella quasipneumoniae": 67,
    "Klebsiella quasivariicola": 68,
    "Klebsiella variicola": 69,
    "Kosakonia cowanii": 70,
    "Kosakonia radicincitans": 71,
    "Leclercia adecarboxylata": 72,
    "Lelliottia amnigena": 73,
    "Lonsdalea populi": 74,
    "Moellerella wisconsensis": 75,
    "Morganella morganii": 76,
    "Mus musculus": 77,
    "Nicotiana tabacum": 78,
    "Nicotiana tabacum chloroplast": 79,
    "Obesumbacterium proteus": 80,
    "Pantoea agglomerans": 81,
    "Pantoea allii": 82,
    "Pantoea ananatis PA13": 83,
    "Pantoea dispersa": 84,
    "Pantoea stewartii": 85,
    "Pantoea vagans": 86,
    "Pectobacterium aroidearum": 87,
    "Pectobacterium atrosepticum": 88,
    "Pectobacterium brasiliense": 89,
    "Pectobacterium carotovorum": 90,
    "Pectobacterium odoriferum": 91,
    "Pectobacterium parmentieri": 92,
    "Pectobacterium polaris": 93,
    "Pectobacterium versatile": 94,
    "Photorhabdus laumondii subsp. laumondii TTO1": 95,
    "Plesiomonas shigelloides": 96,
    "Pluralibacter gergoviae": 97,
    "Proteus faecis": 98,
    "Proteus mirabilis HI4320": 99,
    "Proteus penneri": 100,
    "Proteus terrae subsp. cibarius": 101,
    "Proteus vulgaris": 102,
    "Providencia alcalifaciens": 103,
    "Providencia heimbachae": 104,
    "Providencia rettgeri": 105,
    "Providencia rustigianii": 106,
    "Providencia stuartii": 107,
    "Providencia thailandensis": 108,
    "Pseudomonas putida": 109,
    "Pyrococcus furiosus": 110,
    "Pyrococcus horikoshii": 111,
    "Pyrococcus yayanosii": 112,
    "Rahnella aquatilis CIP 78.65 = ATCC 33071": 113,
    "Raoultella ornithinolytica": 114,
    "Raoultella planticola": 115,
    "Raoultella terrigena": 116,
    "Rosenbergiella epipactidis": 117,
    "Rouxiella badensis": 118,
    "Saccharolobus solfataricus": 119,
    "Saccharomyces cerevisiae": 120,
    "Salmonella bongori N268-08": 121,
    "Salmonella enterica subsp. enterica serovar Typhimurium str. LT2": 122,
    "Serratia bockelmannii": 123,
    "Serratia entomophila": 124,
    "Serratia ficaria": 125,
    "Serratia fonticola": 126,
    "Serratia grimesii": 127,
    "Serratia liquefaciens": 128,
    "Serratia marcescens": 129,
    "Serratia nevei": 130,
    "Serratia plymuthica AS9": 131,
    "Serratia proteamaculans": 132,
    "Serratia quinivorans": 133,
    "Serratia rubidaea": 134,
    "Serratia ureilytica": 135,
    "Shigella boydii": 136,
    "Shigella dysenteriae": 137,
    "Shigella flexneri 2a str. 301": 138,
    "Shigella sonnei": 139,
    "Thermoccoccus kodakarensis": 140,
    "Thermococcus barophilus MPT": 141,
    "Thermococcus chitonophagus": 142,
    "Thermococcus gammatolerans": 143,
    "Thermococcus litoralis": 144,
    "Thermococcus onnurineus": 145,
    "Thermococcus sibiricus": 146,
    "Xenorhabdus bovienii str. feltiae Florida": 147,
    "Yersinia aldovae 670-83": 148,
    "Yersinia aleksiciae": 149,
    "Yersinia alsatica": 150,
    "Yersinia enterocolitica": 151,
    "Yersinia frederiksenii ATCC 33641": 152,
    "Yersinia intermedia": 153,
    "Yersinia kristensenii": 154,
    "Yersinia massiliensis CCUG 53443": 155,
    "Yersinia mollaretii ATCC 43969": 156,
    "Yersinia pestis A1122": 157,
    "Yersinia proxima": 158,
    "Yersinia pseudotuberculosis IP 32953": 159,
    "Yersinia rochesterensis": 160,
    "Yersinia rohdei": 161,
    "Yersinia ruckeri": 162,
    "Yokenella regensburgei": 163,
}

# Dictionary mapping each organism id to its organism name
ID2ORGANISM: Dict[int, str] = {v: k for k, v in ORGANISM2ID.items()}

# Type alias for amino acid to codon mapping: aa -> (codons, frequencies)
AMINO2CODON_TYPE = Dict[str, Tuple[List[str], List[float]]]

# Constants for the number of organisms and sequence lengths.
# Derived from ORGANISM2ID (164) so the two can never drift apart.
NUM_ORGANISMS = len(ORGANISM2ID)
MAX_LEN = 2048
MAX_AMINO_ACIDS = MAX_LEN - 2  # Without special tokens [CLS] and [SEP]
STOP_SYMBOL = "_"
@dataclass
class DNASequencePrediction:
    """
    A dataclass to hold the output of the DNA sequence prediction.

    Attributes:
        organism (str): Name of the organism used for prediction.
        protein (str): Input protein sequence for which DNA sequence is predicted.
        processed_input (str): Processed input sequence (merged protein and DNA).
        predicted_dna (str): Predicted DNA sequence.
    """

    organism: str
    protein: str
    processed_input: str
    predicted_dna: str
class IterableData(torch.utils.data.IterableDataset):
    """
    Defines the logic for iterable datasets (working over streams of
    data) in parallel multi-processing environments, e.g., multi-GPU.

    Args:
        dist_env (Optional[str]): The distribution environment identifier
            (e.g., "slurm").

    Credit: Guillaume Filion
    """

    def __init__(self, dist_env: Optional[str] = None):
        super().__init__()
        # Pick the environment variables that expose world size / rank;
        # default to the torchrun conventions unless a known scheduler
        # (currently only "slurm") is requested.
        if dist_env is None:
            self.world_size_handle, self.rank_handle = ("WORLD_SIZE", "LOCAL_RANK")
        else:
            self.world_size_handle, self.rank_handle = {
                "slurm": ("SLURM_NTASKS", "SLURM_PROCID")
            }.get(dist_env, ("WORLD_SIZE", "LOCAL_RANK"))

    def iterator(self) -> Iterator:
        """Define the stream logic for the dataset. Implement in subclasses."""
        raise NotImplementedError

    def __iter__(self) -> Iterator:
        """
        Create an iterator for the dataset, handling multi-processing contexts.

        Returns:
            Iterator: The iterator for the dataset.
        """
        worker_info = torch.utils.data.get_worker_info()
        if worker_info is None:
            # Single-process loading: consume the stream directly.
            # (Fixed: previously returned the bound method `self.iterator`
            # instead of calling it.)
            return self.iterator()
        # In multi-processing context, use 'os.environ' to
        # find global worker rank. Then use 'islice' to allocate
        # the items of the stream to the workers.
        world_size = int(os.environ.get(self.world_size_handle, "1"))
        global_rank = int(os.environ.get(self.rank_handle, "0"))
        local_rank = worker_info.id
        local_num_workers = worker_info.num_workers
        # Assume that each process has the same number of local workers.
        worker_rk = global_rank * local_num_workers + local_rank
        worker_nb = world_size * local_num_workers
        # islice requires an iterable, so the stream must be instantiated
        # here (calling iterator(), not passing the method object).
        return itertools.islice(self.iterator(), worker_rk, None, worker_nb)
class IterableJSONData(IterableData):
    """
    Iterate over the lines of a JSON-lines file ("finetune_set.json" inside
    data_path), parsing one record per line.

    Args:
        data_path (str): The path to the directory containing the data file.
        train (bool): Flag indicating if the dataset is for training.
            NOTE(review): not used anywhere in this class — presumably
            consumed by callers; confirm before removing.
        **kwargs: Additional keyword arguments for the base class.
    """

    def __init__(self, data_path: str, train: bool = True, **kwargs):
        super().__init__(**kwargs)
        self.data_path = data_path
        self.train = train
        # Eagerly load all records; the file is JSON-lines (one object per line).
        file_path = os.path.join(self.data_path, "finetune_set.json")
        with open(file_path, "r", encoding="utf-8") as f:
            self.records = [json.loads(line) for line in f]

    def __len__(self):
        """Return the number of records loaded from the file."""
        return len(self.records)

    def iterator(self) -> Iterator:
        """Define the stream logic for the dataset."""
        yield from self.records
class ConfigManager(ABC):
    """
    Abstract base class for managing configuration settings.

    Subclasses provide `_config` (via reset_config) and may override
    validate_inputs to enforce constraints. Instances can be used as
    context managers; the configuration is reset on exit.
    """

    # Subclasses populate this dictionary in reset_config().
    _config: Dict[str, Any]

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Always restore defaults on context exit, even after an exception.
        if exc_type is not None:
            print(f"Exception occurred: {exc_type}, {exc_value}, {traceback}")
        self.reset_config()

    def reset_config(self) -> None:
        """Reset the configuration to default values."""
        pass

    def get(self, key: str) -> Any:
        """
        Get the value of a configuration key.

        Args:
            key (str): The key to retrieve the value for.

        Returns:
            Any: The value of the configuration key, or None if absent.
        """
        return self._config.get(key)

    def set(self, key: str, value: Any) -> None:
        """
        Set the value of a configuration key after validating it.

        Args:
            key (str): The key to set the value for.
            value (Any): The value to set for the key.
        """
        self.validate_inputs(key, value)
        self._config[key] = value

    def update(self, config_dict: dict) -> None:
        """
        Update the configuration with a dictionary of key-value pairs after
        validating them.

        Args:
            config_dict (dict): A dictionary of key-value pairs to update
                the configuration with.
        """
        # Validate everything first so a partial update is never applied.
        for key, value in config_dict.items():
            self.validate_inputs(key, value)
        self._config.update(config_dict)

    def validate_inputs(self, key: str, value: Any) -> None:
        """Validate the inputs for the configuration. Override in subclasses."""
        pass
class ProteinConfig(ConfigManager):
    """
    A class to manage configuration settings for protein sequences.

    This class ensures that the configuration is a singleton.
    It provides methods to get, set, and update configuration values.

    Attributes:
        _instance (Optional[ConfigManager]): The singleton instance of the
            ConfigManager.
        _config (Dict[str, Any]): The configuration dictionary.
    """

    _instance = None

    def __new__(cls):
        """
        Create (or return) the singleton instance of ProteinConfig.

        Returns:
            ProteinConfig: The singleton instance of the ProteinConfig.
        """
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance.reset_config()
        return cls._instance

    def validate_inputs(self, key: str, value: Any) -> None:
        """
        Validate the inputs for the configuration.

        Args:
            key (str): The key to validate.
            value (Any): The value to validate.

        Raises:
            ValueError: If the value is invalid or the key is unknown.
            TypeError: If the value is of the wrong type.
        """
        if key == "ambiguous_aminoacid_behavior":
            if value not in [
                "raise_error",
                "standardize_deterministic",
                "standardize_random",
            ]:
                raise ValueError(
                    f"Invalid value for ambiguous_aminoacid_behavior: {value}."
                )
        elif key == "ambiguous_aminoacid_map_override":
            if not isinstance(value, dict):
                raise TypeError(
                    f"Invalid type for ambiguous_aminoacid_map_override: {value}."
                )
            for ambiguous_aminoacid, aminoacids in value.items():
                if not isinstance(aminoacids, list):
                    raise TypeError(f"Invalid type for aminoacids: {aminoacids}.")
                if not aminoacids:
                    raise ValueError(
                        f"Override for aminoacid '{ambiguous_aminoacid}' cannot be empty list."
                    )
                # Only known ambiguous codes may be overridden.
                if ambiguous_aminoacid not in AMBIGUOUS_AMINOACID_MAP:
                    raise ValueError(
                        f"Invalid amino acid in ambiguous_aminoacid_map_override: {ambiguous_aminoacid}"
                    )
        else:
            raise ValueError(f"Invalid configuration key: {key}")

    def reset_config(self) -> None:
        """
        Reset the configuration to the default values.
        """
        self._config = {
            "ambiguous_aminoacid_behavior": "standardize_random",
            "ambiguous_aminoacid_map_override": {},
        }
def load_python_object_from_disk(file_path: str) -> Any:
    """
    Load a Pickle object from disk and return it as a Python object.

    Note: pickle can execute arbitrary code — only use on trusted files.

    Args:
        file_path (str): The path to the Pickle file.

    Returns:
        Any: The loaded Python object.
    """
    with open(file_path, "rb") as file:
        return pickle.load(file)
def save_python_object_to_disk(input_object: Any, file_path: str) -> None:
    """
    Save a Python object to disk using Pickle.

    Args:
        input_object (Any): The Python object to save.
        file_path (str): The path where the object will be saved.
    """
    with open(file_path, "wb") as file:
        pickle.dump(input_object, file)
def find_pattern_in_fasta(keyword: str, text: str) -> str:
    """
    Find the value of a "[keyword=value]" annotation in text. Helpful for
    identifying parts of a FASTA header.

    Args:
        keyword (str): The keyword to search for (treated as a literal
            string, not a regex).
        text (str): The text to search within.

    Returns:
        str: The found value or an empty string if not found.
    """
    # Escape the keyword so regex metacharacters in it are matched literally.
    result = re.search(re.escape(keyword) + r"=(.*?)]", text)
    return result.group(1) if result else ""
def get_organism2id_dict(organism_reference: str) -> Dict[str, int]:
    """
    Return a dictionary mapping each organism in training data to an index
    used for training.

    Args:
        organism_reference (str): Path to a CSV file containing a list of
            all organisms. The format of the CSV file should be as follows:

            0,Escherichia coli
            1,Homo sapiens
            2,Mus musculus

    Returns:
        Dict[str, int]: Dictionary mapping organism names to their
            respective indices.
    """
    # Read the CSV file and create a dictionary mapping organisms to indices.
    # Column 0 (the integer id) becomes the index; column 1 holds the name.
    organisms = pd.read_csv(organism_reference, index_col=0, header=None)
    organism2id = {organisms.iloc[i].values[0]: i for i in organisms.index}
    return organism2id
def get_taxonomy_id(
    taxonomy_reference: str, organism: Optional[str] = None, return_dict: bool = False
) -> Any:
    """
    Return the taxonomy id of a given organism using a reference file.
    Optionally, return the whole dictionary instead if return_dict is True.

    Args:
        taxonomy_reference (str): Path to the taxonomy reference Pickle file.
        organism (Optional[str]): The name of the organism to look up.
            Required when return_dict is False.
        return_dict (bool): Whether to return the entire dictionary
            (sorted by organism name) instead of a single id.

    Returns:
        Any: The taxonomy id of the organism or the entire dictionary.

    Raises:
        KeyError: If `organism` is not present in the reference mapping.
    """
    # Load the organism-to-taxonomy mapping from a Pickle file.
    organism2taxonomy = load_python_object_from_disk(taxonomy_reference)
    if return_dict:
        return dict(sorted(organism2taxonomy.items()))
    return organism2taxonomy[organism]
def sort_amino2codon_skeleton(amino2codon: Dict[str, Any]) -> Dict[str, Any]:
    """
    Sort the amino2codon dictionary alphabetically by amino acid and by
    codon name (frequencies follow their codons).

    Args:
        amino2codon (Dict[str, Any]): The amino2codon dictionary to sort;
            each value is a (codons, frequencies) pair of parallel lists.

    Returns:
        Dict[str, Any]: The sorted amino2codon dictionary.
    """
    sorted_amino2codon: Dict[str, Any] = {}
    # Iterate amino acids alphabetically; sort each codon/frequency pairing
    # once (codons are unique, so ties never compare frequencies).
    for amino in sorted(amino2codon):
        codons, frequencies = amino2codon[amino]
        pairs = sorted(zip(codons, frequencies))
        sorted_amino2codon[amino] = (
            [codon for codon, _ in pairs],
            [freq for _, freq in pairs],
        )
    return sorted_amino2codon
def load_pkl_from_url(url: str) -> Any:
    """
    Download a Pickle file from a URL and return the loaded object.

    SECURITY: unpickling remote content can execute arbitrary code.
    Only call this with URLs you fully trust (e.g., project-owned assets).

    Args:
        url (str): The URL to download the Pickle file from.

    Returns:
        Any: The loaded Python object from the Pickle file.

    Raises:
        requests.HTTPError: If the download fails.
    """
    response = requests.get(url)
    response.raise_for_status()  # Ensure the request was successful
    # Load the Pickle object from the response content.
    return pickle.loads(response.content)