ZhiyuanChen committed · Commit 075fa1f · Parent(s): b641ec4

Update README.md

README.md CHANGED
@@ -10,19 +10,19 @@ library_name: multimolecule
pipeline_tag: fill-mask
mask_token: "<mask>"
widget:
- example_title: "Homo sapiens PRNP mRNA for prion"
  text: "AGC<mask>CATTATGGCGAACCTTGGCTGCTG"
  output:
  - label: "UUN"
    score: 0.011160684749484062
  - label: "NGC"
    score: 0.01067513320595026
  - label: "NNC"
    score: 0.010549729689955711
  - label: "CNA"
    score: 0.0103579331189394
  - label: "GNC"
    score: 0.010322545655071735
---

# CaLM
@@ -75,7 +75,7 @@ CaLM is a [bert](https://huggingface.co/google-bert/bert-base-uncased)-style mod
- **Paper**: [Codon language embeddings provide strong signals for use in protein engineering](https://doi.org/10.1101/2022.12.15.519894)
- **Developed by**: Carlos Outeiral, Charlotte M. Deane
- **Model type**: [BERT](https://huggingface.co/google-bert/bert-base-uncased) - [ESM](https://huggingface.co/facebook/esm2_t48_15B_UR50D)
- **Original Repository**: [oxpig/CaLM](https://github.com/oxpig/CaLM)

## Usage
@@ -92,29 +92,29 @@ You can use this model directly with a pipeline for masked language modeling:

```python
>>> import multimolecule  # you must import multimolecule to register models
>>> from transformers import pipeline
>>> unmasker = pipeline("fill-mask", model="multimolecule/calm")
>>> unmasker("agc<mask>cattatggcgaaccttggctgctg")

[{'score': 0.011160684749484062,
  'token': 100,
  'token_str': 'UUN',
  'sequence': 'AGC UUN CAU UAU GGC GAA CCU UGG CUG CUG'},
 {'score': 0.01067513320595026,
  'token': 117,
  'token_str': 'NGC',
  'sequence': 'AGC NGC CAU UAU GGC GAA CCU UGG CUG CUG'},
 {'score': 0.010549729689955711,
  'token': 127,
  'token_str': 'NNC',
  'sequence': 'AGC NNC CAU UAU GGC GAA CCU UGG CUG CUG'},
 {'score': 0.0103579331189394,
  'token': 51,
  'token_str': 'CNA',
  'sequence': 'AGC CNA CAU UAU GGC GAA CCU UGG CUG CUG'},
 {'score': 0.010322545655071735,
  'token': 77,
  'token_str': 'GNC',
  'sequence': 'AGC GNC CAU UAU GGC GAA CCU UGG CUG CUG'}]
```
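The pipeline returns the five highest-scoring codons by default. `top_k` is a standard fill-mask pipeline argument, so a call along the following lines should keep only the single best candidate (a minimal usage sketch):

```python
>>> unmasker("agc<mask>cattatggcgaaccttggctgctg", top_k=1)  # return only the highest-scoring codon
```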
### Downstream Use

@@ -127,11 +127,11 @@ Here is how to use this model to get the features of a given sequence in PyTorch

```python
from multimolecule import RnaTokenizer, CaLmModel


tokenizer = RnaTokenizer.from_pretrained("multimolecule/calm")
model = CaLmModel.from_pretrained("multimolecule/calm")

text = "GCCAGTCGCTGACAGCCGCGG"
input = tokenizer(text, return_tensors="pt")

output = model(**input)
```
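The returned object follows the usual transformers output conventions. A minimal sketch, assuming the standard `last_hidden_state` field, of deriving per-token and per-sequence embeddings:

```python
# Minimal sketch, assuming the standard transformers output fields.
embeddings = output.last_hidden_state        # (batch_size, sequence_length, hidden_size)
sequence_embedding = embeddings.mean(dim=1)  # simple mean-pooling over tokens
```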
@@ -147,17 +147,17 @@ import torch

```python
import torch
from multimolecule import RnaTokenizer, CaLmForSequencePrediction


tokenizer = RnaTokenizer.from_pretrained("multimolecule/calm")
model = CaLmForSequencePrediction.from_pretrained("multimolecule/calm")

text = "GCCAGTCGCTGACAGCCGCGG"
input = tokenizer(text, return_tensors="pt")
label = torch.tensor([1])

output = model(**input, labels=label)
```
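With `labels` supplied, the output is expected to carry a loss alongside the logits, per the standard transformers convention. A minimal sketch:

```python
# Minimal sketch, assuming the standard loss/logits output convention.
print(output.loss)                   # scalar loss for the provided label
print(output.logits.argmax(dim=-1))  # predicted class index for the sequence
```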
#### Token Classification / Regression

**Note**: This model is not fine-tuned for any specific task. You will need to fine-tune the model on a downstream task to use it for nucleotide classification or regression.
@@ -165,14 +165,14 @@ Here is how to use this model as backbone to fine-tune for a nucleotide-level ta

```python
import torch
from multimolecule import RnaTokenizer, CaLmForTokenPrediction


tokenizer = RnaTokenizer.from_pretrained("multimolecule/calm")
model = CaLmForTokenPrediction.from_pretrained("multimolecule/calm")

text = "GCCAGTCGCTGACAGCCGCGG"
input = tokenizer(text, return_tensors="pt")
label = torch.randint(2, (len(text), ))

output = model(**input, labels=label)
```
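As with sequence prediction, the loss and per-token logits should be exposed on the output. A minimal sketch:

```python
# Minimal sketch, assuming the standard loss/logits output convention.
print(output.loss)          # scalar loss for the random labels above
print(output.logits.shape)  # (batch_size, num_tokens, num_labels)
```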
@@ -189,11 +189,11 @@ import torch

```python
import torch
from multimolecule import RnaTokenizer, CaLmForContactPrediction


tokenizer = RnaTokenizer.from_pretrained("multimolecule/calm")
model = CaLmForContactPrediction.from_pretrained("multimolecule/calm")

text = "GCCAGTCGCTGACAGCCGCGG"
input = tokenizer(text, return_tensors="pt")
label = torch.randint(2, (len(text), len(text)))

output = model(**input, labels=label)
```
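The contact head scores pairs of positions, so the logits are expected to be pairwise. A minimal sketch, assuming the standard loss/logits output convention:

```python
# Minimal sketch: take the argmax over classes to obtain a binary
# predicted contact map, one entry per position pair.
contact_map = output.logits.argmax(dim=-1)
print(output.loss, contact_map.shape)
```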