Christina Theodoris committed
Commit e3330a6
1 Parent(s): d1931b1

edit docstring format to highlight options

Files changed: geneformer/tokenizer.py (+10 -10)

geneformer/tokenizer.py CHANGED
@@ -94,14 +94,14 @@ class TranscriptomeTokenizer:
         | Keys are the names of the attributes in the loom file.
         | Values are the names of the attributes in the dataset.
         nproc : int
-            Number of processes to use for dataset mapping.
+            | Number of processes to use for dataset mapping.
         chunk_size: int = 512
-            Chunk size for anndata tokenizer.
+            | Chunk size for anndata tokenizer.
         gene_median_file : Path
-            Path to pickle file containing dictionary of non-zero median
-            gene expression values across Genecorpus-30M.
+            | Path to pickle file containing dictionary of non-zero median
+            | gene expression values across Genecorpus-30M.
         token_dictionary_file : Path
-            Path to pickle file containing token dictionary (Ensembl IDs:token).
+            | Path to pickle file containing token dictionary (Ensembl IDs:token).
         """
         # dictionary of custom attributes {output dataset column name: input .loom column name}
         self.custom_attr_name_dict = custom_attr_name_dict
@@ -141,15 +141,15 @@ class TranscriptomeTokenizer:
         **Parameters:**

         data_directory : Path
-            Path to directory containing loom files or anndata files
+            | Path to directory containing loom files or anndata files
         output_directory : Path
-            Path to directory where tokenized data will be saved as .dataset
+            | Path to directory where tokenized data will be saved as .dataset
         output_prefix : str
-            Prefix for output .dataset
+            | Prefix for output .dataset
         file_format : str
-            Format of input files. Can be "loom" or "h5ad".
+            | Format of input files. Can be "loom" or "h5ad".
         use_generator : bool
-            Whether to use generator or dict for tokenization.
+            | Whether to use generator or dict for tokenization.
         """
         tokenized_cells, cell_metadata = self.tokenize_files(
             Path(data_directory), file_format
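
For reference, a minimal usage sketch of the tokenizer these docstrings describe, assuming the gene_median_file and token_dictionary_file defaults bundled with the package; the directory paths and attribute names below are placeholders, not part of this commit.

from geneformer import TranscriptomeTokenizer

# Output dataset column name -> input .loom/.h5ad attribute name
# (attribute names here are illustrative placeholders).
custom_attrs = {"cell_type": "cell_type", "organ_major": "organ_major"}

# gene_median_file and token_dictionary_file are assumed to fall back to
# the files shipped with Geneformer when not passed explicitly.
tokenizer = TranscriptomeTokenizer(custom_attrs, nproc=4, chunk_size=512)

# Tokenize every .loom file in the input directory and save the result
# as <output_prefix>.dataset in the output directory.
tokenizer.tokenize_data(
    "data/loom_dir",      # data_directory (placeholder path)
    "data/tokenized",     # output_directory (placeholder path)
    "my_corpus",          # output_prefix
    file_format="loom",
    use_generator=False,
)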
|