yangheng committed
Commit: b0f0a2f
1 Parent(s): b1308ae
Files changed (3):
  1. README.md +3 -3
  2. model.safetensors +1 -1
  3. modeling_omnigenome.py +251 -63
README.md CHANGED
@@ -6,7 +6,7 @@ language:
 
 tags:
 - Genomic-Language-Modeling
-- RNA Genomic Foundation Model
+- OmniGenome Foundation Model
 ---
 
 # Multi-species Foundation Model for Universal RNA and DNA Downstream Tasks
@@ -15,13 +15,13 @@ tags:
 We are keep updating the checkpoints, the current checkpoint is trained for 0.85 epoch.
 
 ## Training Examples
-Refer to GitHub [https://github.com/yangheng95/MP-RNA](https://github.com/yangheng95/MP-RNA)
+Refer to GitHub [https://github.com/yangheng95/OmniGenome](https://github.com/yangheng95/OmniGenome)
 
 ## Usage
 This model is available for replacing genomic foundation models such as CDSBERT, Nucleotide Transformers, DNABERT2, etc.
 ```
 from transformers import AutoModel
-model = AutoModel.from_pretrained("yangheng/MPRNA-52M-v1", trust_remote_code=True)
+model = AutoModel.from_pretrained("yangheng/OmniGenome-52M", trust_remote_code=True)
 ```
 
 ## Subtasks
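The README's Usage snippet only loads the backbone. As a rough end-to-end sketch of the renamed checkpoint (the example RNA fragment is illustrative, and the `last_hidden_state` attribute is assumed from the model's ESM-style outputs, not stated in this commit):

```
import torch
from transformers import AutoModel, AutoTokenizer

model_id = "yangheng/OmniGenome-52M"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModel.from_pretrained(model_id, trust_remote_code=True)

# Encode an illustrative RNA fragment and extract token-level embeddings.
inputs = tokenizer("GCCCGAAUAGCUCAGCCGGG", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (batch, sequence_length, hidden_size)
```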
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:26d7498b3a722bfe09ca9b488b38f315f6696d3f7d6722009cbcfbf4e22480b0
+oid sha256:2300e9ae1743dac0e51fe56eec44718d64b408c349e5f0bc98c567d339fe7938
 size 210828112
modeling_omnigenome.py CHANGED
@@ -15,8 +15,11 @@
 """ PyTorch OmniGenome model."""
 
 import math
+import random
+import warnings
 from typing import List, Optional, Tuple, Union
 
+import numpy as np
 import torch
 import torch.utils.checkpoint
 from torch import nn
@@ -1117,7 +1120,7 @@ class OmniGenomeForMaskedLM(OmniGenomePreTrainedModel):
 
         self.OmniGenome = OmniGenomeModel(config, add_pooling_layer=False)
         self.lm_head = OmniGenomeLMHead(config)
-        # self.init_weights()
+        self.init_weights()
 
     def get_output_embeddings(self):
         return self.lm_head.decoder
@@ -1236,8 +1239,9 @@ class OmniGenomeForSequenceClassification(OmniGenomePreTrainedModel):
         self.num_labels = config.num_labels
         self.config = config
         self.OmniGenome = OmniGenomeModel(config, add_pooling_layer=False)
+        self.pooler = OmniGenomePooler(config)
         self.classifier = OmniGenomeClassificationHead(config)
-        # self.init_weights()
+        self.init_weights()
 
     @add_start_docstrings_to_model_forward(
         OmniGenome_INPUTS_DOCSTRING.format("batch_size, sequence_length")
@@ -1279,8 +1283,10 @@ class OmniGenomeForSequenceClassification(OmniGenomePreTrainedModel):
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
         )
-        sequence_output = outputs[0]
-        logits = self.classifier(sequence_output)
+        last_hidden_state = outputs[0]
+        last_hidden_state = self.dense(last_hidden_state)
+        pooled_output = self.pooler(last_hidden_state)
+        logits = self.classifier(pooled_output)
 
         loss = None
         if labels is not None:
@@ -1338,9 +1344,8 @@ class OmniGenomeForTokenClassification(OmniGenomePreTrainedModel):
         self.OmniGenome = OmniGenomeModel(config, add_pooling_layer=False)
         self.dense = torch.nn.Linear(config.hidden_size, config.hidden_size)
         self.classifier = torch.nn.Linear(self.config.hidden_size, self.num_labels)
-        self.activation = torch.nn.Tanh()
-        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
-        # self.init_weights()
+        self.softmax = nn.Softmax(dim=-1)
+        self.init_weights()
 
     @add_start_docstrings_to_model_forward(
         OmniGenome_INPUTS_DOCSTRING.format("batch_size, sequence_length")
@@ -1366,12 +1371,12 @@ class OmniGenomeForTokenClassification(OmniGenomePreTrainedModel):
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
         """
-
+
         return_dict = (
             return_dict if return_dict is not None else self.config.use_return_dict
         )
-
-        mlm_outputs = self.OmniGenome(
+
+        outputs = self.OmniGenome(
             input_ids,
             attention_mask=attention_mask,
             position_ids=position_ids,
@@ -1381,17 +1386,11 @@ class OmniGenomeForTokenClassification(OmniGenomePreTrainedModel):
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
         )
-        try:
-            last_hidden_state = mlm_outputs[0]
-            last_hidden_state = self.dense(last_hidden_state)
-        except:
-            last_hidden_state = mlm_outputs.hidden_states[-1]
-            last_hidden_state = self.dense(last_hidden_state)
 
+        last_hidden_state = outputs[0]
+        last_hidden_state = self.dense(last_hidden_state)
         logits = self.classifier(last_hidden_state)
-        logits = torch.softmax(logits, dim=-1)
-        logits = self.activation(logits)
-        logits = self.dropout(logits)
+        logits = self.softmax(logits)
 
         loss = None
         if labels is not None:
@@ -1399,14 +1398,14 @@ class OmniGenomeForTokenClassification(OmniGenomePreTrainedModel):
             loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
 
         if not return_dict:
-            output = (logits,) + mlm_outputs[2:]
+            output = (logits,) + outputs[2:]
             return ((loss,) + output) if loss is not None else output
 
         return TokenClassifierOutput(
             loss=loss,
             logits=logits,
-            hidden_states=mlm_outputs.hidden_states,
-            attentions=mlm_outputs.attentions,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
     @staticmethod
@@ -1432,7 +1431,7 @@ class OmniGenomeForTokenClassification(OmniGenomePreTrainedModel):
 
         return structure
 
-    def predict_structure(
+    def predict_rna_structure(
         self,
         input_ids: Optional[torch.LongTensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
@@ -1457,18 +1456,26 @@
 
 @add_start_docstrings(
     """
-    OmniGenome Model with a simple genetic algorithm based RNA design head on top.
+    This is not a standard Seq2Seq model. Instead, this model is designed for RNA design tasks.
+    This is the OmniGenome Model with a simple genetic algorithm based RNA design head on top.
     """,
     OmniGenome_START_DOCSTRING,
 )
-class OmniGenomeMaskedLMForRNADesign(OmniGenomePreTrainedModel):
+class OmniGenomeModelForSeq2SeqLM(OmniGenomePreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
-        self.OmniGenome = OmniGenomeForMaskedLM(config)
+        self.OmniGenome = OmniGenomeModel(config, add_pooling_layer=False)
+        self.lm_head = OmniGenomeLMHead(config)
         self.num_generation = config.num_generation
         self.num_population = config.num_population
-        # self.init_weights()
+        self.init_weights()
+
+        self.tokenizer = None
+        self.predict_structure = None
+
+        warnings.warn(f"This model {self.__class__.__name__} is not a real Seq2Seq model. "
+                      f"Instead, this model is designed for RNA design tasks")
 
     @add_start_docstrings_to_model_forward(
         OmniGenome_INPUTS_DOCSTRING.format("batch_size, sequence_length")
@@ -1494,43 +1501,224 @@ class OmniGenomeMaskedLMForRNADesign(OmniGenomePreTrainedModel):
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
         """
-        return_dict = (
-            return_dict if return_dict is not None else self.config.use_return_dict
-        )
-
-        outputs = self.OmniGenome(
-            input_ids,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-
-        sequence_output = outputs[0]
-
-        sequence_output = self.dropout(sequence_output)
-        logits = self.classifier(sequence_output)
+        raise NotImplementedError("This model is not designed for standard Seq2Seq tasks. "
+                                  "Use model.rna_sequence_design() for RNA sequences design instead.")
 
-        loss = None
-        if labels is not None:
-            loss_fct = CrossEntropyLoss()
-
-            labels = labels.to(logits.device)
-            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-
-        if not return_dict:
-            output = (logits,) + outputs[2:]
-            return ((loss,) + output) if loss is not None else output
-
-        return TokenClassifierOutput(
-            loss=loss,
-            logits=logits,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
-        )
+    def rna_sequence_design(
+        self,
+        structure: str,
+        predict_structure_func=None,
+        **kwargs
+    ) -> List[str]:
+        """
+        Assemble the RNA sequence given the reference sequence structure
+        """
+        if self.tokenizer is None:
+            tokenizer = kwargs.get("tokenizer", None)
+            if tokenizer is None:
+                from transformers import AutoTokenizer
+                self.tokenizer = AutoTokenizer.from_pretrained(self.config.name_or_path)
+            else:
+                self.tokenizer = tokenizer
+
+        candidates = self.genetic_algorithm_for_rna_design(structure, predict_structure_func=None, **kwargs)
+
+        return candidates
+
+    def genetic_algorithm_for_rna_design(self, structure, predict_structure_func=None, **kwargs):
+        if predict_structure_func is None:
+            import ViennaRNA
+
+            def predict_structure(sequence):
+                return ViennaRNA.fold(sequence)[0]
+
+            predict_structure_func = predict_structure
+
+        self.predict_structure = predict_structure_func
+        mutation_ratio = kwargs.get("mutation_ratio", 0.2)
+        num_population = kwargs.get("num_population", self.num_population)
+        num_generation = kwargs.get("num_generation", self.num_generation)
+        import tqdm
+        population = self.init_population(structure, num_population)
+        population = self.mlm_mutate(population, structure, mutation_ratio=mutation_ratio)
+        for generation_id in tqdm.tqdm(range(num_generation), desc="Designing RNA Sequence"):
+            population_fitness = self.sequence_fitness(population, structure)[:num_population]
+            population = sorted(zip(population, population_fitness), key=lambda x: x[1])[:num_population]
+            population = [x[0] for x in population]
+            next_generation = population  # Elitism
+            next_generation += self.crossover(population, structure)
+            next_generation += self.mlm_mutate(next_generation, structure, mutation_ratio)
+            fitness_values = self.sequence_fitness(next_generation, structure)
+            next_generation = sorted(zip(next_generation, fitness_values), key=lambda x: x[1])
+
+            candidate_sequences = []
+            for sequence, fitness in next_generation:
+                if fitness == 0:
+                    candidate_sequences.append(sequence)
+                else:
+                    break
+            if candidate_sequences:
+                return candidate_sequences
+            print(f"Generation {generation_id}: {next_generation[0][0]} with fitness {next_generation[0][1]}")
+            population = [x[0] for x in next_generation[:num_population]]
+
+        return []
+
+    def init_population(self, structure, num_population):
+        # Initialize lists to store population data and inputs for masked language model
+        population = []
+        mlm_inputs = []
+        # Iterate over the number of individuals in the population
+        for _ in range(num_population):  # Changed from self.num_population to num_population
+            # Create a sequence by randomly choosing nucleotides or a mask token for each position in the structure
+            masked_sequence = [
+                random.choice(["A", "G", "C", "T", "<mask>"])
+                for _ in range(len(structure))
+            ]
+            masked_sequence_str = "".join(masked_sequence)
+            mlm_inputs.append(f"{masked_sequence_str}<eos>{''.join(structure)}")
+
+        # Call a function to predict outputs using the masked language model
+        outputs = self.mlm_predict(mlm_inputs, structure)
+
+        # Decode the mlm outputs and construct the initial population
+        for i in range(len(outputs)):
+            sequence = self.tokenizer.convert_ids_to_tokens(outputs[i].tolist())
+            fixed_sequence = [
+                x if x in "AGCT" else random.choice(["G", "C"])
+                for x, y in zip(sequence, list(mlm_inputs[i].replace('<mask>', '$')))
+            ]
+            population.append("".join(fixed_sequence))
+
+        return population
+
+    def mlm_mutate(self, population, structure, mutation_ratio=0.2):
+        def mutate(sequence, mutation_rate=0.2):
+            sequence = np.array(list(sequence), dtype=np.str_)
+            probability_matrix = np.full(sequence.shape, mutation_rate)
+            masked_indices = np.random.rand(*sequence.shape) < probability_matrix
+            sequence[masked_indices] = "$"
+            mut_seq = "".join(sequence.tolist()).replace("$", "<mask>")
+            return mut_seq
+        def mutate_with_spans_mask(sequence, mutation_rate=0.2):
+            sequence = np.array(list(sequence), dtype=np.str_)
+            length = len(sequence)
+            num_mutations = int(mutation_rate * length)  # Total number of mutations is based on mutation rate
+            # Decide the average span length; we assume mutation spans about 20% of the total mutations length
+            average_span_length = random.randint(1, max(1, int(length * mutation_rate / 10)))
+            # Initialize mutation points
+            mutation_points = np.random.choice(length, num_mutations, replace=False)  # Start points for mutations
+            # Create the mask
+            mask = np.zeros(length, dtype=bool)
+            for start in mutation_points:
+                end = start + average_span_length
+                if end > length:
+                    end = length
+                mask[start:end] = True  # Masking a span from start to end
+            # Apply mask
+            sequence[mask] = "<mask>"
+            # Combine the masked parts with the rest of the sequence
+            mutated_sequence = ''.join(sequence)
+            # Since multiple consecutive '<mask>'s might occur, replace them with a single '<mask>'
+            mutated_sequence = mutated_sequence.replace('<mask>' * average_span_length, '<mask>')
+            return mutated_sequence
+
+        # Initialize lists to store population data and inputs for masked language model
+        mlm_inputs = []
+        masked_sequences = []
+
+        # Iterate over the number of individuals in the population
+        for sequence in population:
+            # Create a sequence by randomly choosing nucleotides or a mask token for each position in the structure
+            if random.random() < 1:
+                masked_sequence = mutate(sequence, mutation_ratio)
+            else:
+                masked_sequence = mutate_with_spans_mask(sequence, mutation_ratio)
+            masked_sequences.append(masked_sequence)
+            mlm_inputs.append(f"{masked_sequence}<eos>{''.join(structure)}")
+
+        # Call a function to predict outputs using the masked language model
+        outputs = self.mlm_predict(mlm_inputs, structure)
+
+        mut_population = []
+
+        # Decode the mlm outputs and construct the initial population
+        for i in range(len(outputs)):
+            sequence = self.tokenizer.convert_ids_to_tokens(outputs[i].tolist())
+            fixed_sequence = [
+                x if x in "AGCT" else random.choice(["G", "C"])
+                for x, y in zip(sequence, list(masked_sequences[i].replace('<mask>', '$')))
+            ]
+            mut_population.append("".join(fixed_sequence))
+
+        return mut_population
+
+    def crossover(self, population, structure):
+        crossover_population = []
+        batch_crossover_inputs = []
+        for i in range(len(population)):
+            parent1, parent2 = random.choices(population, k=2)
+            pos = random.randint(1, len(parent1) - 1)
+            child1 = parent1[:pos] + "<mask>" * len(parent2[pos:])
+            child2 = "<mask>" * len(parent1[:pos]) + parent2[pos:]
+            batch_crossover_inputs.append(f"{child1}<eos>{structure}")
+            batch_crossover_inputs.append(f"{child2}<eos>{structure}")
+
+        outputs = self.mlm_predict(batch_crossover_inputs, structure)
+
+        for i in range(len(outputs)):
+            sequence = self.tokenizer.convert_ids_to_tokens(outputs[i].tolist())
+            fixed_sequence = [
+                x if x in "AGCT" else random.choice(["G", "C"])
+                for x, y in zip(sequence, list(batch_crossover_inputs[i].replace('<mask>', '$')))
+            ]
+            crossover_population.append("".join(fixed_sequence))
+
+        return crossover_population
+
+    def sequence_fitness(self, sequences, structure):
+        fitness_values = []
+        structures = [self.predict_structure(sequence) for sequence in sequences]
+        for predicted_structure in structures:
+            scores = []
+            for i in range(len(predicted_structure)):
+                if predicted_structure[i] == structure[i]:
+                    scores.append(1)
+                elif (
+                    predicted_structure[i] == ")"
+                    and structure[i] == "("
+                    or predicted_structure[i] == "("
+                    and structure[i] == ")"
+                ):
+                    scores.append(-3)
+                else:
+                    scores.append(0)
+            score = 1 - sum(scores) / len(structure)
+            fitness_values.append(score)
+        return fitness_values
+
+    def mlm_predict(self, mlm_inputs, structure):
+        batch_size = 4
+        all_outputs = []
+        from transformers import set_seed
+        set_seed(random.randint(0, 99999999), deterministic=False)
+
+        with torch.no_grad():
+            for i in range(0, len(mlm_inputs), batch_size):
+                batch_mlm_inputs = self.tokenizer(
+                    mlm_inputs[i:i + batch_size],
+                    padding=True,
+                    max_length=len(mlm_inputs[0]) // 2,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+                batch_mlm_inputs = batch_mlm_inputs.to(self.device)
+                outputs = self.OmniGenome(**batch_mlm_inputs)[0]
+                outputs = self.lm_head(outputs)
+                outputs = outputs.argmax(dim=-1)
+                all_outputs.append(outputs)
+            outputs = torch.cat(all_outputs, dim=0)
+        return outputs[:, 1:1 + len(structure)]
 
 
 # Copied from transformers.models.esm.modeling_esm.EsmClassificationHead with Esm->OmniGenome
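The forward() of the new OmniGenomeModelForSeq2SeqLM intentionally raises, so the genetic-algorithm path added in this commit is only reachable through rna_sequence_design(). Below is a minimal, hedged sketch of a call; the AutoModelForSeq2SeqLM routing via trust_remote_code and the example structure string are assumptions for illustration, while the method name, its parameters, and the ViennaRNA/tqdm dependencies come from the code above.

```
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_id = "yangheng/OmniGenome-52M"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
# Assumption: the repo's auto_map routes AutoModelForSeq2SeqLM to the
# OmniGenomeModelForSeq2SeqLM class defined in this file.
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, trust_remote_code=True)
model.eval()

# Target secondary structure in dot-bracket notation (illustrative).
structure = "((((....))))"

# Runs the MLM-seeded genetic algorithm above: init_population, mlm_mutate,
# crossover, and a ViennaRNA.fold() based fitness check each generation.
# Requires the ViennaRNA and tqdm packages to be installed.
candidates = model.rna_sequence_design(structure, tokenizer=tokenizer)
print(candidates)  # sequences predicted to fold into `structure`, or [] if none found
```

Per the loop in genetic_algorithm_for_rna_design, the search returns as soon as any candidate reaches fitness 0 (its predicted structure exactly matches the target) and otherwise returns an empty list after num_generation generations.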