lixiangchun committed on
Commit a3b02b5
1 Parent(s): 794c624

update README

Files changed (1)
  1. README.md +55 -9
README.md CHANGED
@@ -1,22 +1,68 @@
  ```python
- from transformers import PreTrainedTokenizerFast, BertForMaskedLM
  import re

  tokenizer = PreTrainedTokenizerFast.from_pretrained("lixiangchun/transcriptome_iseeek_13millioncells_128tokens")
- iseeek = BertForMaskedLM.from_pretrained("lixiangchun/transcriptome_iseeek_13millioncells_128tokens")

- a = ["B2M MTRNR2L8 UBC FOS TMSB4X UBB FTH1 IFITM1 TPT1 FTL DUSP1", "KRT14 MTRNR2L8 KRT6A B2M GAPDH S100A8 S100A9 KRT5"]

- # Replace '-' and '.' with '_'
- a = [re.sub(r'\-|\.', '_', s) for s in a]

- batch = tokenizer(a, max_length=128, truncation=True, padding=True, return_tensors="pt")
- out = iseeek.bert(**batch)

- # [CLS] representation
- feature = out.last_hidden_state[:,0,:]

  ```
 
+ # iSEEEK
+ A universal approach for integrating super large-scale single-cell transcriptomes by exploring gene rankings

  ```python
+ ## A simple pipeline for single-cell analysis
+ import torch
  import re
+ from tqdm import tqdm
+ import numpy as np
+ import scanpy as sc
+ from torch.utils.data import DataLoader, Dataset
+ from transformers import PreTrainedTokenizerFast, BertForMaskedLM
+
+ # One cell per line of gene symbols; '-' and '.' in symbols are replaced with '_'
+ class LineDataset(Dataset):
+     def __init__(self, lines):
+         self.lines = lines
+         self.regex = re.compile(r'\-|\.')
+     def __getitem__(self, i):
+         return self.regex.sub('_', self.lines[i])
+     def __len__(self):
+         return len(self.lines)
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ torch.set_num_threads(2)

  tokenizer = PreTrainedTokenizerFast.from_pretrained("lixiangchun/transcriptome_iseeek_13millioncells_128tokens")
+ model = BertForMaskedLM.from_pretrained("lixiangchun/transcriptome_iseeek_13millioncells_128tokens").bert
+ model = model.to(device)
+ model.eval()
+
+ text_file = "/mnt/ssd2/shenhr/BERT/bert_256/pbmc/deal/gene_rank_pmbc.txt"
+ labels = [s.strip() for s in open('/mnt/ssd2/shenhr/BERT/bert_256/pbmc/deal/labels.txt')]
+ labels = np.asarray(labels)
+
+ lines = [s.strip() for s in open(text_file)]
+
+ ds = LineDataset(lines)
+ dl = DataLoader(ds, batch_size=80)
+
+ features = []
+
+ for a in tqdm(dl, total=len(dl)):
+     batch = tokenizer(a, max_length=128, truncation=True,
+                       padding=True, return_tensors="pt")
+
+     for k, v in batch.items():
+         batch[k] = v.to(device)

+     with torch.no_grad():
+         out = model(**batch)

+     # [CLS] representation of each cell
+     f = out.last_hidden_state[:, 0, :]
+     features.extend(f.tolist())

+ features = np.stack(features)

+ adata = sc.AnnData(features)
+ adata.obs['celltype'] = labels
+ adata.obs.celltype = adata.obs.celltype.astype("category")
+ sc.pp.neighbors(adata, use_rep='X')
+ sc.tl.umap(adata)
+ sc.tl.leiden(adata)
+ sc.pl.umap(adata, color=['celltype', 'leiden'], save="UMAP")

  ```
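
The pipeline above expects a pre-built gene-ranking text file (one cell per line, gene symbols separated by spaces, as in `gene_rank_pmbc.txt` and in the example strings from the earlier quick-start snippet). The sketch below is a minimal, self-contained illustration of how such input might be produced and embedded. The "rank each cell's genes by expression and keep the top ones" step is an assumption inferred from the project title and the filenames, not something this commit specifies, and the toy gene symbols, expression values, and `top_k` cutoff are placeholders; only the tokenizer and model calls mirror the pipeline above.

```python
# Hypothetical sketch: build gene-ranking strings for a few cells and embed them
# with iSEEEK. The ranking step is an assumption; the tokenizer/model usage
# follows the README pipeline.
import numpy as np
import torch
from transformers import PreTrainedTokenizerFast, BertForMaskedLM

# Toy expression matrix: 3 cells x 6 genes (placeholder values).
genes = np.array(["B2M", "FTL", "KRT14", "S100A8", "HLA-A", "MT-CO1"])
expr = np.array([
    [5.0, 3.0, 0.0, 0.0, 2.0, 4.0],
    [0.0, 1.0, 6.0, 5.0, 0.0, 2.0],
    [2.0, 2.0, 1.0, 0.0, 3.0, 1.0],
])

top_k = 4  # placeholder cutoff; the model accepts at most 128 tokens per cell
lines = []
for row in expr:
    order = np.argsort(-row)[:top_k]           # highest-expressed genes first
    ranked = " ".join(genes[order])
    lines.append(ranked.replace("-", "_").replace(".", "_"))  # as in the README

tokenizer = PreTrainedTokenizerFast.from_pretrained(
    "lixiangchun/transcriptome_iseeek_13millioncells_128tokens")
model = BertForMaskedLM.from_pretrained(
    "lixiangchun/transcriptome_iseeek_13millioncells_128tokens").bert
model.eval()

batch = tokenizer(lines, max_length=128, truncation=True,
                  padding=True, return_tensors="pt")
with torch.no_grad():
    out = model(**batch)

features = out.last_hidden_state[:, 0, :]  # [CLS] embedding per cell
print(features.shape)  # (3, hidden_size)
```

The resulting per-cell features could then be wrapped in `sc.AnnData` and clustered and visualized exactly as in the pipeline above.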