Codyfederer committed
Commit 8bca88b · verified · Parent(s): 9a0c826

Upload tokenize_dataset.py with huggingface_hub

Files changed (1):
  tokenize_dataset.py (+2 -3)
tokenize_dataset.py CHANGED

@@ -1,7 +1,6 @@
 # /// script
 # requires-python = ">=3.10"
 # dependencies = [
-# "torch>=2.0.0",
 # "transformers>=4.50.0",
 # "datasets>=2.14.0",
 # "huggingface_hub",
@@ -218,14 +217,14 @@ def main():
     train_dataset = train_dataset.map(
         lambda x: tokenize_conversation(x, tokenizer, MAX_SEQ_LENGTH),
         remove_columns=["messages"],
-        num_proc=4,
+        num_proc=1,  # Use single process to reduce memory
         desc="Tokenizing train",
     )

     eval_dataset = eval_dataset.map(
         lambda x: tokenize_conversation(x, tokenizer, MAX_SEQ_LENGTH),
         remove_columns=["messages"],
-        num_proc=4,
+        num_proc=1,  # Use single process to reduce memory
         desc="Tokenizing eval",
     )
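For context on the second hunk: datasets.Dataset.map with num_proc=4 forks four worker processes, each holding its own copy of the mapped function's state (including the tokenizer) plus a shard of the dataset, so num_proc=1 keeps tokenization in a single process and caps peak memory at the cost of throughput. The first hunk fits the same theme: the "# /// script" header is PEP 723 inline script metadata, and a tokenization-only pass needs transformers and datasets but not torch, so dropping "torch>=2.0.0" shrinks the environment that a PEP 723-aware runner (for example, "uv run tokenize_dataset.py") has to resolve.

The body of tokenize_conversation is not part of this diff. As a rough sketch only, assuming the script prepares chat-format "messages" for causal-LM fine-tuning with a standard Hugging Face tokenizer, the helper might look something like the following (the function body, the MAX_SEQ_LENGTH value, and the model id are assumptions, not the file's actual contents):

# Hypothetical sketch; the real helper in tokenize_dataset.py is not shown in this commit.
from transformers import AutoTokenizer

MAX_SEQ_LENGTH = 2048  # assumed value; the actual constant is defined elsewhere in the script

def tokenize_conversation(example, tokenizer, max_seq_length):
    # Render the chat turns into one training string via the model's chat template.
    text = tokenizer.apply_chat_template(example["messages"], tokenize=False)
    # Tokenize and truncate to the configured context length.
    tokens = tokenizer(text, truncation=True, max_length=max_seq_length)
    # For plain causal-LM fine-tuning, labels are a copy of input_ids.
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

# Placeholder model id, chosen only because it ships a chat template.
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M-Instruct")
example = {"messages": [{"role": "user", "content": "Hello!"}]}
print(len(tokenize_conversation(example, tokenizer, MAX_SEQ_LENGTH)["input_ids"]))

Because the lambda passed to map closes over the tokenizer, each extra worker under num_proc>1 must carry that state too, which is why the single-process setting trades speed for a smaller footprint.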