lixiangchun committed on
Commit
70cd2c0
1 Parent(s): dc0bd2a

update README

Browse files
Files changed (1) hide show
  1. README.md +32 -3
README.md CHANGED
@@ -2,9 +2,10 @@
2
  # iSEEEK
3
  A universal approach for integrating super large-scale single-cell transcriptomes by exploring gene rankings
4
 
5
- ```python
6
  ## A simple pipeline for single-cell analysis
 
7
  import torch
 
8
  import re
9
  from tqdm import tqdm
10
  import numpy as np
@@ -31,8 +32,8 @@ model.eval()
31
 
32
 
33
  ## Data deposited in https://huggingface.co/TJMUCH/transcriptome-iseeek/tree/main
34
- lines = [s.strip() for s in gzip.open("pbmc_ranking.txt.gz")]
35
- labels = [s.strip() for s in gzip.open("pbmc_label.txt.gz")]
36
  labels = np.asarray(labels)
37
 
38
 
@@ -66,3 +67,31 @@ sc.pl.umap(adata, color=['celltype','leiden'],save= "UMAP")
66
 
67
  ```
68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  # iSEEEK
3
  A universal approach for integrating super large-scale single-cell transcriptomes by exploring gene rankings
4
 
 
5
  ## A simple pipeline for single-cell analysis
6
+ ```python
7
  import torch
8
+ import gzip
9
  import re
10
  from tqdm import tqdm
11
  import numpy as np
 
32
 
33
 
34
  ## Data deposited in https://huggingface.co/TJMUCH/transcriptome-iseeek/tree/main
35
+ lines = [s.strip().decode() for s in gzip.open("pbmc_ranking.txt.gz")]
36
+ labels = [s.strip().decode() for s in gzip.open("pbmc_label.txt.gz")]
37
  labels = np.asarray(labels)
38
 
39
 
 
67
 
68
  ```
69
 
70
+ ## Extract token representations
71
+ ```python
72
+
73
+ cell_counts = len(lines)
74
+ x = np.zeros((cell_counts, len(tokenizer)), dtype=np.float16)
75
+
76
+ for a in tqdm(dl, total=len(dl)):
77
+ batch = tokenizer(a, max_length=128, truncation=True,
78
+ padding=True, return_tensors="pt")
79
+
80
+ for k, v in batch.items():
81
+ batch[k] = v.to(device)
82
+
83
+ with torch.no_grad():
84
+ out = model(**batch)
85
+
86
+ eos_idxs = batch.attention_mask.sum(dim=1) - 1
87
+ f = out.last_hidden_state
88
+ batch_size = f.shape[0]
89
+ input_ids = batch.input_ids
90
+
91
+ for i in range(batch_size):
92
+ ##genes = tokenizer.batch_decode(input_ids[i])
93
+ token_norms = [f[i][j].norm().item() for j in range(1, eos_idxs[i])]
94
+ idxs = input_ids[i].tolist()[1:eos_idxs[i]]
95
+ x[counter, idxs] = token_norms
96
+ counter = counter + 1
97
+ ```