ZhiyuanChen
committed on
Commit
•
82173e0
1
Parent(s):
9d75aba
Update README.md
Browse files
README.md
CHANGED
@@ -13,6 +13,19 @@ library_name: multimolecule
|
|
13 |
pipeline_tag: fill-mask
|
14 |
mask_token: "<mask>"
|
15 |
widget:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
- example_title: "microRNA-21"
|
17 |
text: "UAGC<mask>UAUCAGACUGAUGUUGA"
|
18 |
output:
|
@@ -61,7 +74,7 @@ RiNALMo is a [bert](https://huggingface.co/google-bert/bert-base-uncased)-style
|
|
61 |
- **Paper**: [RiNALMo: General-Purpose RNA Language Models Can Generalize Well on Structure Prediction Tasks](https://doi.org/10.48550/arXiv.2403.00043)
|
62 |
- **Developed by**: Rafael Josip Penić, Tin Vlašić, Roland G. Huber, Yue Wan, Mile Šikić
|
63 |
- **Model type**: [BERT](https://huggingface.co/google-bert/bert-base-uncased)
|
64 |
-
- **Original Repository**: [
|
65 |
|
66 |
## Usage
|
67 |
|
@@ -78,29 +91,29 @@ You can use this model directly with a pipeline for masked language modeling:
|
|
78 |
```python
|
79 |
>>> import multimolecule # you must import multimolecule to register models
|
80 |
>>> from transformers import pipeline
|
81 |
-
>>> unmasker = pipeline(
|
82 |
-
>>> unmasker("
|
83 |
|
84 |
-
[{'score': 0.
|
85 |
'token': 6,
|
86 |
'token_str': 'A',
|
87 |
-
'sequence': '
|
88 |
-
{'score': 0.
|
89 |
'token': 9,
|
90 |
'token_str': 'U',
|
91 |
-
'sequence': 'U
|
92 |
-
{'score': 0.
|
93 |
'token': 22,
|
94 |
'token_str': 'X',
|
95 |
-
'sequence': '
|
96 |
-
{'score': 0.
|
97 |
'token': 7,
|
98 |
'token_str': 'C',
|
99 |
-
'sequence': 'U
|
100 |
-
{'score': 0.
|
101 |
'token': 8,
|
102 |
'token_str': 'G',
|
103 |
-
'sequence': 'U
|
104 |
```
|
105 |
|
106 |
### Downstream Use
|
@@ -113,11 +126,11 @@ Here is how to use this model to get the features of a given sequence in PyTorch
|
|
113 |
from multimolecule import RnaTokenizer, RiNALMoModel
|
114 |
|
115 |
|
116 |
-
tokenizer = RnaTokenizer.from_pretrained(
|
117 |
-
model = RiNALMoModel.from_pretrained(
|
118 |
|
119 |
text = "UAGCUUAUCAGACUGAUGUUGA"
|
120 |
-
input = tokenizer(text, return_tensors=
|
121 |
|
122 |
output = model(**input)
|
123 |
```
|
@@ -133,17 +146,17 @@ import torch
|
|
133 |
from multimolecule import RnaTokenizer, RiNALMoForSequencePrediction
|
134 |
|
135 |
|
136 |
-
tokenizer = RnaTokenizer.from_pretrained(
|
137 |
-
model = RiNALMoForSequencePrediction.from_pretrained(
|
138 |
|
139 |
text = "UAGCUUAUCAGACUGAUGUUGA"
|
140 |
-
input = tokenizer(text, return_tensors=
|
141 |
label = torch.tensor([1])
|
142 |
|
143 |
output = model(**input, labels=label)
|
144 |
```
|
145 |
|
146 |
-
####
|
147 |
|
148 |
**Note**: This model is not fine-tuned for any specific task. You will need to fine-tune the model on a downstream task to use it for nucleotide classification or regression.
|
149 |
|
@@ -151,14 +164,14 @@ Here is how to use this model as backbone to fine-tune for a nucleotide-level ta
|
|
151 |
|
152 |
```python
|
153 |
import torch
|
154 |
-
from multimolecule import RnaTokenizer,
|
155 |
|
156 |
|
157 |
-
tokenizer = RnaTokenizer.from_pretrained(
|
158 |
-
model =
|
159 |
|
160 |
text = "UAGCUUAUCAGACUGAUGUUGA"
|
161 |
-
input = tokenizer(text, return_tensors=
|
162 |
label = torch.randint(2, (len(text), ))
|
163 |
|
164 |
output = model(**input, labels=label)
|
@@ -175,11 +188,11 @@ import torch
|
|
175 |
from multimolecule import RnaTokenizer, RiNALMoForContactPrediction
|
176 |
|
177 |
|
178 |
-
tokenizer = RnaTokenizer.from_pretrained(
|
179 |
-
model = RiNALMoForContactPrediction.from_pretrained(
|
180 |
|
181 |
text = "UAGCUUAUCAGACUGAUGUUGA"
|
182 |
-
input = tokenizer(text, return_tensors=
|
183 |
label = torch.randint(2, (len(text), len(text)))
|
184 |
|
185 |
output = model(**input, labels=label)
|
|
|
13 |
pipeline_tag: fill-mask
|
14 |
mask_token: "<mask>"
|
15 |
widget:
|
16 |
+
- example_title: "HIV-1"
|
17 |
+
text: "GGUC<mask>CUCUGGUUAGACCAGAUCUGAGCCU"
|
18 |
+
output:
|
19 |
+
- label: "A"
|
20 |
+
score: 0.3932918310165405
|
21 |
+
- label: "U"
|
22 |
+
score: 0.2897723913192749
|
23 |
+
- label: "X"
|
24 |
+
score: 0.15423105657100677
|
25 |
+
- label: "C"
|
26 |
+
score: 0.12160095572471619
|
27 |
+
- label: "G"
|
28 |
+
score: 0.0408296100795269
|
29 |
- example_title: "microRNA-21"
|
30 |
text: "UAGC<mask>UAUCAGACUGAUGUUGA"
|
31 |
output:
|
|
|
74 |
- **Paper**: [RiNALMo: General-Purpose RNA Language Models Can Generalize Well on Structure Prediction Tasks](https://doi.org/10.48550/arXiv.2403.00043)
|
75 |
- **Developed by**: Rafael Josip Penić, Tin Vlašić, Roland G. Huber, Yue Wan, Mile Šikić
|
76 |
- **Model type**: [BERT](https://huggingface.co/google-bert/bert-base-uncased)
|
77 |
+
- **Original Repository**: [lbcb-sci/RiNALMo](https://github.com/lbcb-sci/RiNALMo)
|
78 |
|
79 |
## Usage
|
80 |
|
|
|
91 |
```python
|
92 |
>>> import multimolecule # you must import multimolecule to register models
|
93 |
>>> from transformers import pipeline
|
94 |
+
>>> unmasker = pipeline("fill-mask", model="multimolecule/rinalmo")
|
95 |
+
>>> unmasker("gguc<mask>cucugguuagaccagaucugagccu")
|
96 |
|
97 |
+
[{'score': 0.3932918310165405,
|
98 |
'token': 6,
|
99 |
'token_str': 'A',
|
100 |
+
'sequence': 'G G U C A C U C U G G U U A G A C C A G A U C U G A G C C U'},
|
101 |
+
{'score': 0.2897723913192749,
|
102 |
'token': 9,
|
103 |
'token_str': 'U',
|
104 |
+
'sequence': 'G G U C U C U C U G G U U A G A C C A G A U C U G A G C C U'},
|
105 |
+
{'score': 0.15423105657100677,
|
106 |
'token': 22,
|
107 |
'token_str': 'X',
|
108 |
+
'sequence': 'G G U C X C U C U G G U U A G A C C A G A U C U G A G C C U'},
|
109 |
+
{'score': 0.12160095572471619,
|
110 |
'token': 7,
|
111 |
'token_str': 'C',
|
112 |
+
'sequence': 'G G U C C C U C U G G U U A G A C C A G A U C U G A G C C U'},
|
113 |
+
{'score': 0.0408296100795269,
|
114 |
'token': 8,
|
115 |
'token_str': 'G',
|
116 |
+
'sequence': 'G G U C G C U C U G G U U A G A C C A G A U C U G A G C C U'}]
|
117 |
```
|
118 |
|
119 |
### Downstream Use
|
|
|
126 |
from multimolecule import RnaTokenizer, RiNALMoModel
|
127 |
|
128 |
|
129 |
+
tokenizer = RnaTokenizer.from_pretrained("multimolecule/rinalmo")
|
130 |
+
model = RiNALMoModel.from_pretrained("multimolecule/rinalmo")
|
131 |
|
132 |
text = "UAGCUUAUCAGACUGAUGUUGA"
|
133 |
+
input = tokenizer(text, return_tensors="pt")
|
134 |
|
135 |
output = model(**input)
|
136 |
```
|
|
|
146 |
from multimolecule import RnaTokenizer, RiNALMoForSequencePrediction
|
147 |
|
148 |
|
149 |
+
tokenizer = RnaTokenizer.from_pretrained("multimolecule/rinalmo")
|
150 |
+
model = RiNALMoForSequencePrediction.from_pretrained("multimolecule/rinalmo")
|
151 |
|
152 |
text = "UAGCUUAUCAGACUGAUGUUGA"
|
153 |
+
input = tokenizer(text, return_tensors="pt")
|
154 |
label = torch.tensor([1])
|
155 |
|
156 |
output = model(**input, labels=label)
|
157 |
```
|
158 |
|
159 |
+
#### Token Classification / Regression
|
160 |
|
161 |
**Note**: This model is not fine-tuned for any specific task. You will need to fine-tune the model on a downstream task to use it for nucleotide classification or regression.
|
162 |
|
|
|
164 |
|
165 |
```python
|
166 |
import torch
|
167 |
+
from multimolecule import RnaTokenizer, RiNALMoForTokenPrediction
|
168 |
|
169 |
|
170 |
+
tokenizer = RnaTokenizer.from_pretrained("multimolecule/rinalmo")
|
171 |
+
model = RiNALMoForTokenPrediction.from_pretrained("multimolecule/rinalmo")
|
172 |
|
173 |
text = "UAGCUUAUCAGACUGAUGUUGA"
|
174 |
+
input = tokenizer(text, return_tensors="pt")
|
175 |
label = torch.randint(2, (len(text), ))
|
176 |
|
177 |
output = model(**input, labels=label)
|
|
|
188 |
from multimolecule import RnaTokenizer, RiNALMoForContactPrediction
|
189 |
|
190 |
|
191 |
+
tokenizer = RnaTokenizer.from_pretrained("multimolecule/rinalmo")
|
192 |
+
model = RiNALMoForContactPrediction.from_pretrained("multimolecule/rinalmo")
|
193 |
|
194 |
text = "UAGCUUAUCAGACUGAUGUUGA"
|
195 |
+
input = tokenizer(text, return_tensors="pt")
|
196 |
label = torch.randint(2, (len(text), len(text)))
|
197 |
|
198 |
output = model(**input, labels=label)
|