Update README.md
README.md
CHANGED
@@ -11,16 +11,18 @@ The model can be used for feature extractions in protein function prediction tasks
 ```python
 from transformers import AutoTokenizer, AutoModel
 tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")
-model = AutoModel.from_pretrained("h4duan/PAIR-esm2")
+model = AutoModel.from_pretrained("h4duan/PAIR-esm2").to("cuda")
 protein = ["AETCZAO"]
 
 def extract_feature(protein):
     ids = tokenizer(protein, return_tensors="pt", padding=True, max_length=1024, truncation=True, return_attention_mask=True)
-    input_ids = torch.tensor(ids['input_ids']).to(
-    attention_mask = torch.tensor(ids['attention_mask']).to(
+    input_ids = torch.tensor(ids['input_ids']).to("cuda")
+    attention_mask = torch.tensor(ids['attention_mask']).to("cuda")
     with torch.no_grad():
         embedding_repr = model(input_ids=input_ids,attention_mask=attention_mask).last_hidden_state
-    return torch.mean(embedding_repr)
+    return torch.mean(embedding_repr, dim=1)
+
+feature = extract_feature(protein)
 ```
 
 <h1>How to extract the features in batch?</h1>
@@ -29,8 +31,8 @@ def extract_features_batch(proteins):
 proteins = ["AETCZAO","SKTZP"]
 def extract_features_batch(proteins):
     ids = tokenizer(proteins, return_tensors="pt", padding=True, max_length=1024, truncation=True, return_attention_mask=True)
-    input_ids = torch.tensor(ids['input_ids']).to(
-    attention_mask = torch.tensor(ids['attention_mask']).to(
+    input_ids = torch.tensor(ids['input_ids']).to("cuda")
+    attention_mask = torch.tensor(ids['attention_mask']).to("cuda")
     with torch.no_grad():
         embedding_repr = model(input_ids=input_ids,attention_mask=attention_mask).last_hidden_state
     attention_mask = attention_mask.unsqueeze(-1)
@@ -40,4 +42,6 @@ def extract_features_batch(proteins):
     non_zero_count = attention_mask.sum(dim=1)
     mean_embedding_repr = sum_embedding_repr / non_zero_count
     return mean_embedding_repr
+
+feature = extract_features_batch(proteins)
 ```
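For reference, below is a self-contained sketch of what the batch-extraction code looks like after this change. The `import torch` line, the `model.eval()` call, and the masked-sum step (`sum_embedding_repr`) are assumptions: they live in unchanged README lines that the hunks above do not show. Everything else follows the added (`+`) lines; swap `"cuda"` for `"cpu"` if no GPU is available.

```python
import torch
from transformers import AutoTokenizer, AutoModel

# Tokenizer comes from the base ESM-2 checkpoint; weights come from PAIR-esm2, as in the README.
tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")
model = AutoModel.from_pretrained("h4duan/PAIR-esm2").to("cuda")
model.eval()  # assumed: inference only

proteins = ["AETCZAO", "SKTZP"]

def extract_features_batch(proteins):
    # Tokenize the batch with padding; sequences longer than 1024 tokens are truncated.
    ids = tokenizer(proteins, return_tensors="pt", padding=True, max_length=1024,
                    truncation=True, return_attention_mask=True)
    # return_tensors="pt" already yields tensors, so the torch.tensor(...) wrapper
    # used in the README is not strictly needed here.
    input_ids = ids["input_ids"].to("cuda")
    attention_mask = ids["attention_mask"].to("cuda")
    with torch.no_grad():
        # Per-residue embeddings: (batch, seq_len, hidden_dim)
        embedding_repr = model(input_ids=input_ids,
                               attention_mask=attention_mask).last_hidden_state
    # Mean-pool over non-padding positions only.
    attention_mask = attention_mask.unsqueeze(-1)                      # (batch, seq_len, 1)
    sum_embedding_repr = (embedding_repr * attention_mask).sum(dim=1)  # assumed masked sum (not shown in the diff)
    non_zero_count = attention_mask.sum(dim=1)                         # (batch, 1)
    mean_embedding_repr = sum_embedding_repr / non_zero_count          # (batch, hidden_dim)
    return mean_embedding_repr

features = extract_features_batch(proteins)  # one pooled embedding per input sequence
```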
|