anndata tokenizer

#102
by giovp - opened
Files changed (1)
  1. geneformer/tokenizer.py +95 -12
geneformer/tokenizer.py CHANGED
@@ -14,6 +14,9 @@ Usage:
   tk.tokenize_data("loom_data_directory", "output_directory", "output_prefix")
 """
 
+
+from __future__ import annotations
+from typing import Literal
 import pickle
 from pathlib import Path
 
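A note on the two new imports: the `Path | str` union syntax in the new signatures only evaluates at runtime on Python 3.10+, but `from __future__ import annotations` (PEP 563) stores annotations as strings instead of evaluating them, so the signatures also parse on older interpreters. A quick self-contained check:

    from __future__ import annotations

    from pathlib import Path

    def f(p: Path | str) -> str:
        # under PEP 563 the annotation is kept as the string "Path | str"
        # and never evaluated, so this runs on Python 3.7+ as well
        return str(p)

    print(f.__annotations__)  # {'p': 'Path | str', 'return': 'str'}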
@@ -85,42 +88,119 @@ class TranscriptomeTokenizer:
         # protein-coding and miRNA gene list dictionary for selecting .loom rows for tokenization
         self.genelist_dict = dict(zip(self.gene_keys, [True] * len(self.gene_keys)))
 
-    def tokenize_data(self, loom_data_directory, output_directory, output_prefix):
+    def tokenize_data(
+        self,
+        data_directory: Path | str,
+        output_directory: Path | str,
+        output_prefix: str,
+        file_format: Literal["loom", "h5ad"] = "loom",
+    ):
         """
-        Tokenize .loom files in loom_data_directory and save as tokenized .dataset in output_directory.
+        Tokenize .loom or .h5ad files in data_directory and save as tokenized .dataset in output_directory.
 
         Parameters
         ----------
-        loom_data_directory : Path
-            Path to directory containing loom files
+        data_directory : Path
+            Path to directory containing .loom or .h5ad files
         output_directory : Path
             Path to directory where tokenized data will be saved as .dataset
         output_prefix : str
             Prefix for output .dataset
+        file_format : str
+            Format of input files. Can be "loom" or "h5ad".
         """
-        tokenized_cells, cell_metadata = self.tokenize_files(Path(loom_data_directory))
+        tokenized_cells, cell_metadata = self.tokenize_files(
+            Path(data_directory), file_format
+        )
         tokenized_dataset = self.create_dataset(tokenized_cells, cell_metadata)
 
         output_path = (Path(output_directory) / output_prefix).with_suffix(".dataset")
         tokenized_dataset.save_to_disk(output_path)
 
-    def tokenize_files(self, loom_data_directory):
+    def tokenize_files(
+        self, data_directory, file_format: Literal["loom", "h5ad"] = "loom"
+    ):
         tokenized_cells = []
         loom_cell_attr = [attr_key for attr_key in self.custom_attr_name_dict.keys()]
-        cell_metadata = {attr_key: [] for attr_key in self.custom_attr_name_dict.values()}
+        cell_metadata = {
+            attr_key: [] for attr_key in self.custom_attr_name_dict.values()
+        }
 
-        # loops through directories to tokenize .loom files
-        for loom_file_path in loom_data_directory.glob("*.loom"):
-            print(f"Tokenizing {loom_file_path}")
-            file_tokenized_cells, file_cell_metadata = self.tokenize_file(
-                loom_file_path
-            )
+        # loop through directory to tokenize .loom or .h5ad files
+        tokenize_file_fn = (
+            self.tokenize_file if file_format == "loom" else self.tokenize_anndata
+        )
+        for file_path in data_directory.glob("*.{}".format(file_format)):
+            print(f"Tokenizing {file_path}")
+            file_tokenized_cells, file_cell_metadata = tokenize_file_fn(file_path)
             tokenized_cells += file_tokenized_cells
             for k in loom_cell_attr:
                 cell_metadata[self.custom_attr_name_dict[k]] += file_cell_metadata[k]
 
         return tokenized_cells, cell_metadata
 
+    def tokenize_anndata(self, adata_file_path):
+        import anndata as ad
+        from scipy.sparse import csr_matrix
+
+        adata = ad.read(adata_file_path)
+        file_cell_metadata = {
+            attr_key: [] for attr_key in self.custom_attr_name_dict.keys()
+        }
+
+        # locate genes present in the tokenizer vocabulary and look up
+        # their nonzero medians and token ids
+        coding_miRNA_loc = np.where(
+            [self.genelist_dict.get(i, False) for i in adata.var["ensembl_id"]]
+        )[0]
+        norm_factor_vector = np.array(
+            [
+                self.gene_median_dict[i]
+                for i in adata.var["ensembl_id"][coding_miRNA_loc]
+            ]
+        )
+        coding_miRNA_ids = adata.var["ensembl_id"][coding_miRNA_loc]
+        coding_miRNA_tokens = np.array(
+            [self.gene_token_dict[i] for i in coding_miRNA_ids]
+        )
+
+        # restrict to QC-passing cells if a "filter_pass" column is present
+        if "filter_pass" in adata.obs:
+            filter_pass_loc = np.where(adata.obs["filter_pass"] == 1)[0]
+        else:
+            print(
+                f"{adata_file_path} has no column attribute 'filter_pass'; tokenizing all cells."
+            )
+            filter_pass_loc = np.arange(adata.shape[0])
+
+        tokenized_cells = []
+        adata_filter = adata[
+            filter_pass_loc, coding_miRNA_loc  # filter cells and genes
+        ]
+
+        # normalize by total counts per cell (scaled to 10k), then by each gene's
+        # nonzero median; n_counts must come from the filtered view so the row
+        # count matches adata_filter.X, and the dense result of sparse/dense
+        # division is converted back to CSR explicitly
+        X_norm = csr_matrix(
+            adata_filter.X
+            / adata_filter.obs["n_counts"].values.reshape(-1, 1)
+            * 10_000
+            / norm_factor_vector
+        )
+
+        tokenized_cells += [
+            tokenize_cell(X_norm[i].toarray().flatten(), coding_miRNA_tokens)
+            for i in range(X_norm.shape[0])
+        ]
+
+        # add custom attributes for the filtered cells to dict
+        for k in file_cell_metadata.keys():
+            file_cell_metadata[k] += adata_filter.obs[k].tolist()
+
+        return tokenized_cells, file_cell_metadata
+
     def tokenize_file(self, loom_file_path):
         file_cell_metadata = {
             attr_key: [] for attr_key in self.custom_attr_name_dict.keys()
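For context, a minimal sketch of an .h5ad input the new tokenize_anndata path can consume. The field names (`var["ensembl_id"]`, `obs["n_counts"]`, the optional `obs["filter_pass"]` flag, plus any custom `obs` columns named in `custom_attr_name_dict`) are the ones the method reads; the toy counts, cell names, and file path are illustrative only:

    import anndata as ad
    import numpy as np
    import pandas as pd
    from scipy.sparse import csr_matrix

    # toy matrix: 3 cells x 2 genes, raw counts
    X = csr_matrix(np.array([[4.0, 0.0], [1.0, 3.0], [0.0, 2.0]], dtype=np.float32))

    adata = ad.AnnData(
        X=X,
        obs=pd.DataFrame(
            {
                "n_counts": np.asarray(X.sum(axis=1)).ravel(),  # total counts per cell
                "filter_pass": [1, 1, 0],  # optional QC flag; 1 = tokenize this cell
                "cell_type": ["T cell", "B cell", "T cell"],  # example custom attribute
            },
            index=["cell_0", "cell_1", "cell_2"],
        ),
        var=pd.DataFrame(
            {"ensembl_id": ["ENSG00000000003", "ENSG00000000005"]},
            index=["TSPAN6", "TNMD"],
        ),
    )
    adata.write("example.h5ad")  # place in the directory passed to tokenize_data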
@@ -162,7 +242,9 @@ class TranscriptomeTokenizer:
 
         # scan through .loom files and tokenize cells
         tokenized_cells = []
-        for (_ix, _selection, view) in data.scan(items=filter_pass_loc, axis=1):
+        for _ix, _selection, view in data.scan(
+            items=filter_pass_loc, axis=1, layers=""
+        ):
             # select subview with protein-coding and miRNA genes
             subview = view.view[coding_miRNA_loc, :]
 
@@ -174,6 +256,7 @@ class TranscriptomeTokenizer:
                 * 10_000
                 / norm_factor_vector[:, None]
             )
+
             # tokenize subview gene vectors
             tokenized_cells += [
                 tokenize_cell(subview_norm_array[:, i], coding_miRNA_tokens)
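End-to-end, the new path is a one-argument switch on the existing API. A usage sketch, assuming the `custom_attr_name_dict`-plus-`nproc` constructor shown in the module's usage docstring; substitute your own column names and paths:

    from geneformer import TranscriptomeTokenizer

    # map input obs columns to column names in the output .dataset
    tk = TranscriptomeTokenizer({"cell_type": "cell_type"}, nproc=4)

    # with file_format="h5ad" the tokenizer globs data_directory/*.h5ad
    # and routes each file to tokenize_anndata instead of tokenize_file
    tk.tokenize_data(
        "data_directory", "output_directory", "output_prefix", file_format="h5ad"
    )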