ZhiyuanChen committed
Commit 075fa1f
1 Parent(s): b641ec4

Update README.md

Files changed (1):
  1. README.md +47 -47

README.md CHANGED
@@ -10,19 +10,19 @@ library_name: multimolecule
 pipeline_tag: fill-mask
 mask_token: "<mask>"
 widget:
-  - example_title: "PRNP"
-    text: "CTG<mask>AAGCGGCCCACGCGGACTGACGGGCGGGGG"
+  - example_title: "Homo sapiens PRNP mRNA for prion"
+    text: "AGC<mask>CATTATGGCGAACCTTGGCTGCTG"
     output:
-      - label: "GUG"
-        score: 0.010724939405918121
+      - label: "UUN"
+        score: 0.011160684749484062
+      - label: "NGC"
+        score: 0.01067513320595026
+      - label: "NNC"
+        score: 0.010549729689955711
+      - label: "CNA"
+        score: 0.0103579331189394
       - label: "GNC"
-        score: 0.010476444847881794
-      - label: "AUC"
-        score: 0.010415051132440567
-      - label: "GGG"
-        score: 0.010389575734734535
-      - label: "AAU"
-        score: 0.01017767284065485
+        score: 0.010322545655071735
 ---
 
 # CaLM
@@ -75,7 +75,7 @@ CaLM is a [bert](https://huggingface.co/google-bert/bert-base-uncased)-style mod
 - **Paper**: [Codon language embeddings provide strong signals for use in protein engineering](https://doi.org/10.1101/2022.12.15.519894)
 - **Developed by**: Carlos Outeiral, Charlotte M. Deane
 - **Model type**: [BERT](https://huggingface.co/google-bert/bert-base-uncased) - [ESM](https://huggingface.co/facebook/esm2_t48_15B_UR50D)
-- **Original Repository**: [https://github.com/oxpig/CaLM](https://github.com/oxpig/CaLM)
+- **Original Repository**: [oxpig/CaLM](https://github.com/oxpig/CaLM)
 
 ## Usage
 
@@ -92,29 +92,29 @@ You can use this model directly with a pipeline for masked language modeling:
 ```python
 >>> import multimolecule  # you must import multimolecule to register models
 >>> from transformers import pipeline
->>> unmasker = pipeline('fill-mask', model='multimolecule/calm')
->>> unmasker("ctg<mask>aagcggcccacgcggactgacgggcggggg")
-
-[{'score': 0.010724939405918121,
-  'token': 73,
-  'token_str': 'GUG',
-  'sequence': 'CUG GUG AAG CGG CCC ACG CGG ACU GAC GGG CGG GGG'},
- {'score': 0.010476444847881794,
+>>> unmasker = pipeline("fill-mask", model="multimolecule/calm")
+>>> unmasker("agc<mask>cattatggcgaaccttggctgctg")
+
+[{'score': 0.011160684749484062,
+  'token': 100,
+  'token_str': 'UUN',
+  'sequence': 'AGC UUN CAU UAU GGC GAA CCU UGG CUG CUG'},
+ {'score': 0.01067513320595026,
+  'token': 117,
+  'token_str': 'NGC',
+  'sequence': 'AGC NGC CAU UAU GGC GAA CCU UGG CUG CUG'},
+ {'score': 0.010549729689955711,
+  'token': 127,
+  'token_str': 'NNC',
+  'sequence': 'AGC NNC CAU UAU GGC GAA CCU UGG CUG CUG'},
+ {'score': 0.0103579331189394,
+  'token': 51,
+  'token_str': 'CNA',
+  'sequence': 'AGC CNA CAU UAU GGC GAA CCU UGG CUG CUG'},
+ {'score': 0.010322545655071735,
   'token': 77,
   'token_str': 'GNC',
-  'sequence': 'CUG GNC AAG CGG CCC ACG CGG ACU GAC GGG CGG GGG'},
- {'score': 0.010415051132440567,
-  'token': 22,
-  'token_str': 'AUC',
-  'sequence': 'CUG AUC AAG CGG CCC ACG CGG ACU GAC GGG CGG GGG'},
- {'score': 0.010389575734734535,
-  'token': 68,
-  'token_str': 'GGG',
-  'sequence': 'CUG GGG AAG CGG CCC ACG CGG ACU GAC GGG CGG GGG'},
- {'score': 0.01017767284065485,
-  'token': 9,
-  'token_str': 'AAU',
-  'sequence': 'CUG AAU AAG CGG CCC ACG CGG ACU GAC GGG CGG GGG'}]
+  'sequence': 'AGC GNC CAU UAU GGC GAA CCU UGG CUG CUG'}]
 ```
 
 ### Downstream Use
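
The pipeline in this hunk reports the five highest-scoring codons by default. A minimal sketch of widening the candidate list, assuming only the standard `top_k` argument of the transformers fill-mask pipeline (the masked sequence is the one from the example above):

```python
import multimolecule  # noqa: F401 -- registers MultiMolecule models with transformers
from transformers import pipeline

unmasker = pipeline("fill-mask", model="multimolecule/calm")

# `top_k` requests more candidates than the default five predictions.
for prediction in unmasker("agc<mask>cattatggcgaaccttggctgctg", top_k=10):
    print(prediction["token_str"], round(prediction["score"], 6))
```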
@@ -127,11 +127,11 @@ Here is how to use this model to get the features of a given sequence in PyTorch
 from multimolecule import RnaTokenizer, CaLmModel
 
 
-tokenizer = RnaTokenizer.from_pretrained('multimolecule/calm')
-model = CaLmModel.from_pretrained('multimolecule/calm')
+tokenizer = RnaTokenizer.from_pretrained("multimolecule/calm")
+model = CaLmModel.from_pretrained("multimolecule/calm")
 
 text = "GCCAGTCGCTGACAGCCGCGG"
-input = tokenizer(text, return_tensors='pt')
+input = tokenizer(text, return_tensors="pt")
 
 output = model(**input)
 ```
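
For orientation, the features returned by `CaLmModel` can be inspected as in the sketch below; it assumes the output follows the usual transformers convention and exposes a `last_hidden_state` tensor of shape (batch, tokens, hidden size):

```python
from multimolecule import RnaTokenizer, CaLmModel

tokenizer = RnaTokenizer.from_pretrained("multimolecule/calm")
model = CaLmModel.from_pretrained("multimolecule/calm")

input = tokenizer("GCCAGTCGCTGACAGCCGCGG", return_tensors="pt")
output = model(**input)

# Assumed transformers-style field: one embedding per token,
# including any special tokens added by the tokenizer.
print(output.last_hidden_state.shape)
```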
@@ -147,17 +147,17 @@ import torch
 from multimolecule import RnaTokenizer, CaLmForSequencePrediction
 
 
-tokenizer = RnaTokenizer.from_pretrained('multimolecule/calm')
-model = CaLmForSequencePrediction.from_pretrained('multimolecule/calm')
+tokenizer = RnaTokenizer.from_pretrained("multimolecule/calm")
+model = CaLmForSequencePrediction.from_pretrained("multimolecule/calm")
 
 text = "GCCAGTCGCTGACAGCCGCGG"
-input = tokenizer(text, return_tensors='pt')
+input = tokenizer(text, return_tensors="pt")
 label = torch.tensor([1])
 
 output = model(**input, labels=label)
 ```
 
-#### Nucleotide Classification / Regression
+#### Token Classification / Regression
 
 **Note**: This model is not fine-tuned for any specific task. You will need to fine-tune the model on a downstream task to use it for nucleotide classification or regression.
 
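The sequence-prediction head also accepts batches; a sketch under the assumption that `RnaTokenizer` supports the standard transformers `padding` argument (the two sequences and labels here are placeholders):

```python
import torch
from multimolecule import RnaTokenizer, CaLmForSequencePrediction

tokenizer = RnaTokenizer.from_pretrained("multimolecule/calm")
model = CaLmForSequencePrediction.from_pretrained("multimolecule/calm")

# Placeholder sequences; padding aligns them to a common length.
texts = ["GCCAGTCGCTGACAGCCGCGG", "GCCAGTCGCTGACAGCCG"]
input = tokenizer(texts, return_tensors="pt", padding=True)
labels = torch.tensor([1, 0])

output = model(**input, labels=labels)
print(output.loss)
```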
@@ -165,14 +165,14 @@ Here is how to use this model as backbone to fine-tune for a nucleotide-level ta
 
 ```python
 import torch
-from multimolecule import RnaTokenizer, CaLmForNucleotidePrediction
+from multimolecule import RnaTokenizer, CaLmForTokenPrediction
 
 
-tokenizer = RnaTokenizer.from_pretrained('multimolecule/calm')
-model = CaLmForNucleotidePrediction.from_pretrained('multimolecule/calm')
+tokenizer = RnaTokenizer.from_pretrained("multimolecule/calm")
+model = CaLmForTokenPrediction.from_pretrained("multimolecule/calm")
 
 text = "GCCAGTCGCTGACAGCCGCGG"
-input = tokenizer(text, return_tensors='pt')
+input = tokenizer(text, return_tensors="pt")
 label = torch.randint(2, (len(text), ))
 
 output = model(**input, labels=label)
@@ -189,11 +189,11 @@ import torch
 from multimolecule import RnaTokenizer, CaLmForContactPrediction
 
 
-tokenizer = RnaTokenizer.from_pretrained('multimolecule/calm')
-model = CaLmForContactPrediction.from_pretrained('multimolecule/calm')
+tokenizer = RnaTokenizer.from_pretrained("multimolecule/calm")
+model = CaLmForContactPrediction.from_pretrained("multimolecule/calm")
 
 text = "GCCAGTCGCTGACAGCCGCGG"
-input = tokenizer(text, return_tensors='pt')
+input = tokenizer(text, return_tensors="pt")
 label = torch.randint(2, (len(text), len(text)))
 
 output = model(**input, labels=label)
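
Once a head such as the contact-prediction one has been fine-tuned, predictions are read off the returned logits; a generic inference sketch, assuming the head exposes a transformers-style `logits` field whose last dimension indexes classes:

```python
import torch
from multimolecule import RnaTokenizer, CaLmForContactPrediction

tokenizer = RnaTokenizer.from_pretrained("multimolecule/calm")
model = CaLmForContactPrediction.from_pretrained("multimolecule/calm")

text = "GCCAGTCGCTGACAGCCGCGG"
input = tokenizer(text, return_tensors="pt")

# No gradients are needed at inference time.
with torch.no_grad():
    output = model(**input)

# Binary contact calls per position pair, taken as the argmax class.
contacts = output.logits.argmax(dim=-1)
```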
 