Upload tokenize_dataset.py with huggingface_hub
tokenize_dataset.py  +2 -3
tokenize_dataset.py
CHANGED
@@ -1,7 +1,6 @@
 # /// script
 # requires-python = ">=3.10"
 # dependencies = [
-# "torch>=2.0.0",
 # "transformers>=4.50.0",
 # "datasets>=2.14.0",
 # "huggingface_hub",
@@ -218,14 +217,14 @@ def main():
     train_dataset = train_dataset.map(
         lambda x: tokenize_conversation(x, tokenizer, MAX_SEQ_LENGTH),
         remove_columns=["messages"],
-        num_proc=
+        num_proc=1, # Use single process to reduce memory
         desc="Tokenizing train",
     )

     eval_dataset = eval_dataset.map(
         lambda x: tokenize_conversation(x, tokenizer, MAX_SEQ_LENGTH),
         remove_columns=["messages"],
-        num_proc=
+        num_proc=1, # Use single process to reduce memory
         desc="Tokenizing eval",
     )

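For context, num_proc in datasets.Dataset.map controls how many worker processes run the mapping function; with num_proc=1 the tokenization runs in the main process, trading speed for a smaller memory footprint, which matches the comment added in this commit. Below is a minimal, hypothetical sketch of such a per-example tokenization map. The body of tokenize_conversation, the model name, and the MAX_SEQ_LENGTH value are illustrative assumptions, not the script's actual implementation.

# Hypothetical sketch, not the committed tokenize_dataset.py.
from datasets import Dataset
from transformers import AutoTokenizer

MAX_SEQ_LENGTH = 2048  # assumed value, for illustration only

def tokenize_conversation(example, tokenizer, max_length):
    # Render the chat messages to a single string, then tokenize with truncation.
    text = tokenizer.apply_chat_template(example["messages"], tokenize=False)
    return tokenizer(text, truncation=True, max_length=max_length)

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M-Instruct")  # any chat model works
train_dataset = Dataset.from_dict({
    "messages": [[{"role": "user", "content": "Hi"},
                  {"role": "assistant", "content": "Hello!"}]],
})

train_dataset = train_dataset.map(
    lambda x: tokenize_conversation(x, tokenizer, MAX_SEQ_LENGTH),
    remove_columns=["messages"],
    num_proc=1,  # single process: slower, but avoids per-worker copies in memory
    desc="Tokenizing train",
)
print(train_dataset[0].keys())  # input_ids, attention_mask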