Christina Theodoris committed on
Commit
0960cf6
1 Parent(s): 2f25aea

Add option for modified batch size for loom tokenizer

Browse files
Files changed (1) hide show
  1. geneformer/tokenizer.py +4 -2
geneformer/tokenizer.py CHANGED
@@ -157,7 +157,7 @@ class TranscriptomeTokenizer:
157
  tokenize_file_fn = (
158
  self.tokenize_loom if file_format == "loom" else self.tokenize_anndata
159
  )
160
- for file_path in data_directory.glob("*.{}".format(file_format)):
161
  file_found = 1
162
  print(f"Tokenizing {file_path}")
163
  file_tokenized_cells, file_cell_metadata = tokenize_file_fn(file_path)
@@ -278,7 +278,9 @@ class TranscriptomeTokenizer:
278
 
279
  # scan through .loom files and tokenize cells
280
  tokenized_cells = []
281
- for _ix, _selection, view in data.scan(items=filter_pass_loc, axis=1):
 
 
282
  # select subview with protein-coding and miRNA genes
283
  subview = view.view[coding_miRNA_loc, :]
284
 
 
157
  tokenize_file_fn = (
158
  self.tokenize_loom if file_format == "loom" else self.tokenize_anndata
159
  )
160
+ for file_path in data_directory.glob(f"*.{file_format}"):
161
  file_found = 1
162
  print(f"Tokenizing {file_path}")
163
  file_tokenized_cells, file_cell_metadata = tokenize_file_fn(file_path)
 
278
 
279
  # scan through .loom files and tokenize cells
280
  tokenized_cells = []
281
+ for _ix, _selection, view in data.scan(
282
+ items=filter_pass_loc, axis=1, batch_size=self.chunk_size
283
+ ):
284
  # select subview with protein-coding and miRNA genes
285
  subview = view.view[coding_miRNA_loc, :]
286