Christina Theodoris committed on
Commit
f0b6641
1 Parent(s): feeecd0

Update links including unsorted example lengths file

Browse files
examples/pretraining_new_model/pretrain_geneformer_w_deepspeed.py CHANGED
@@ -99,7 +99,7 @@ subprocess.call(f"mkdir {training_output_dir}", shell=True)
99
  subprocess.call(f"mkdir {model_output_dir}", shell=True)
100
 
101
 
102
- # load gene_ensembl_id:token dictionary (e.g. https://huggingface.co/datasets/ctheodoris/Genecorpus-30M/tree/main/datasets/token_dictionary.pkl)
103
  with open("token_dictionary.pkl", "rb") as fp:
104
  token_dictionary = pickle.load(fp)
105
 
@@ -153,8 +153,8 @@ trainer = GeneformerPretrainer(
153
  args=training_args,
154
  # pretraining corpus (e.g. https://huggingface.co/datasets/ctheodoris/Genecorpus-30M/tree/main/genecorpus_30M_2048.dataset)
155
  train_dataset=load_from_disk("genecorpus_30M_2048.dataset"),
156
- # file of lengths of each example cell (e.g. https://huggingface.co/datasets/ctheodoris/Genecorpus-30M/tree/main/genecorpus_30M_2048_sorted_lengths.pkl)
157
- example_lengths_file="genecorpus_30M_2048_sorted_lengths.pkl",
158
  token_dictionary=token_dictionary,
159
  )
160
 
99
  subprocess.call(f"mkdir {model_output_dir}", shell=True)
100
 
101
 
102
+ # load gene_ensembl_id:token dictionary (e.g. https://huggingface.co/datasets/ctheodoris/Genecorpus-30M/blob/main/token_dictionary.pkl)
103
  with open("token_dictionary.pkl", "rb") as fp:
104
  token_dictionary = pickle.load(fp)
105
 
153
  args=training_args,
154
  # pretraining corpus (e.g. https://huggingface.co/datasets/ctheodoris/Genecorpus-30M/tree/main/genecorpus_30M_2048.dataset)
155
  train_dataset=load_from_disk("genecorpus_30M_2048.dataset"),
156
+ # file of lengths of each example cell (e.g. https://huggingface.co/datasets/ctheodoris/Genecorpus-30M/blob/main/genecorpus_30M_2048_lengths.pkl)
157
+ example_lengths_file="genecorpus_30M_2048_lengths.pkl",
158
  token_dictionary=token_dictionary,
159
  )
160