Codyfederer committed on
Commit 9a0c826 · verified · 1 Parent(s): 38d7272

Upload tokenize_dataset.py with huggingface_hub
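
For context, a minimal sketch of how a single file is typically uploaded with huggingface_hub (the destination repo id below is hypothetical; the commit header does not show it):

from huggingface_hub import HfApi

api = HfApi()  # picks up the token stored by `huggingface-cli login`
api.upload_file(
    path_or_fileobj="tokenize_dataset.py",
    path_in_repo="tokenize_dataset.py",
    repo_id="Codyfederer/example-repo",  # hypothetical target repo, not shown in this commit header
    repo_type="model",                   # assumption; use "dataset" for a dataset repo
)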

Files changed (1)
  1. tokenize_dataset.py +308 -0
tokenize_dataset.py ADDED
@@ -0,0 +1,308 @@
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "torch>=2.0.0",
#     "transformers>=4.50.0",
#     "datasets>=2.14.0",
#     "huggingface_hub",
# ]
# ///
"""
Tokenize Dataset Script: Prepare Tool Calling Dataset for Training

This script tokenizes the nvidia/Nemotron-Agentic-v1 tool_calling dataset
and uploads it to HuggingFace Hub for reuse.

Usage:
    uv run tokenize_dataset.py

Can run on CPU - no GPU required!
"""

import os
import json
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer
from huggingface_hub import hf_hub_download, HfApi, create_repo

# ============================================================================
# CONFIGURATION
# ============================================================================

# Model to get tokenizer from
BASE_MODEL = "Tesslate/Synthia-S1-27b"

# Source dataset
DATASET_NAME = "nvidia/Nemotron-Agentic-v1"
DATASET_SPLIT = "tool_calling"

# Output tokenized dataset
TOKENIZED_DATASET_REPO = "Codyfederer/synthia-tool-calling-tokenized"
TOKENIZED_DATASET_PRIVATE = True

# Tokenization settings
MAX_SEQ_LENGTH = 4096

# ============================================================================
# TOKENIZATION FUNCTIONS
# ============================================================================

def tokenize_conversation(example, tokenizer, max_length):
    """
    Tokenize a conversation using the model's chat template.
    Returns input_ids, attention_mask, and labels for causal LM training.
    """
    messages = example["messages"]

    # Apply chat template to get the full text
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False
    )

    # Tokenize the text
    tokenized = tokenizer(
        text,
        truncation=True,
        max_length=max_length,
        padding=False,
        return_tensors=None,
    )

    # For causal LM, labels are the same as input_ids
    tokenized["labels"] = tokenized["input_ids"].copy()

    return tokenized


def main():
    print("=" * 60)
    print("Tokenize Dataset for Tool Calling Training")
    print("=" * 60)

    # Get HF username
    from huggingface_hub import whoami
    try:
        username = whoami()["name"]
        print(f"Logged in as: {username}")
    except Exception as e:
        print(f"ERROR: Not logged in to HF Hub ({e})")
        print("Run 'huggingface-cli login' first")
        return

    # -------------------------------------------------------------------------
    # Load Tokenizer
    # -------------------------------------------------------------------------
    print(f"\nLoading tokenizer from {BASE_MODEL}...")

    tokenizer = AutoTokenizer.from_pretrained(
        BASE_MODEL,
        trust_remote_code=True,
        padding_side="right",
    )

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    print(f"Vocab size: {len(tokenizer):,}")

    # -------------------------------------------------------------------------
    # Load Source Dataset
    # -------------------------------------------------------------------------
    print(f"\nLoading dataset: {DATASET_NAME} ({DATASET_SPLIT} split)...")

    # Download the JSONL file
    jsonl_file = f"data/{DATASET_SPLIT}.jsonl"
    print(f"Downloading {jsonl_file}...")

    local_path = hf_hub_download(
        repo_id=DATASET_NAME,
        filename=jsonl_file,
        repo_type="dataset"
    )
    print(f"Downloaded to: {local_path}")

    # Load and process JSONL
    print("Loading and processing JSONL file...")
    processed_examples = []
    skipped = 0

    with open(local_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f):
            if line_num % 50000 == 0:
                print(f" Processed {line_num:,} lines...")
            try:
                example = json.loads(line.strip())
                messages = example.get("messages", [])

                # Convert messages to consistent format
                formatted_messages = []
                for msg in messages:
                    role = msg.get("role", "user")
                    content = msg.get("content", "")

                    # Handle content that might be a list or complex object
                    if isinstance(content, list):
                        parts = []
                        for item in content:
                            if isinstance(item, dict):
                                if "text" in item:
                                    parts.append(item["text"])
                                else:
                                    parts.append(json.dumps(item))
                            else:
                                parts.append(str(item))
                        content = "\n".join(parts) if parts else ""
                    elif isinstance(content, dict):
                        content = json.dumps(content)
                    elif content is None:
                        content = ""
                    else:
                        content = str(content)

                    formatted_messages.append({
                        "role": role,
                        "content": content
                    })

                # Merge consecutive messages with same role
                if formatted_messages:
                    merged_messages = []
                    for msg in formatted_messages:
                        role = msg["role"]
                        content = msg["content"]

                        # Map tool role to user
                        if role == "tool":
                            role = "user"
                            content = f"[Tool Result]\n{content}"

                        if merged_messages and merged_messages[-1]["role"] == role:
                            merged_messages[-1]["content"] += f"\n\n{content}"
                        else:
                            merged_messages.append({"role": role, "content": content})

                    # Ensure conversation starts with user
                    if merged_messages and merged_messages[0]["role"] != "user":
                        merged_messages.insert(0, {"role": "user", "content": "[Start]"})

                    processed_examples.append({"messages": merged_messages})

            except Exception as e:
                skipped += 1
                if skipped < 5:
                    print(f" Warning: Skipped line {line_num}: {e}")

    print(f"Loaded {len(processed_examples):,} examples (skipped {skipped})")

    # Create dataset
    dataset = Dataset.from_list(processed_examples)
    print(f"Dataset size: {len(dataset):,} examples")

    # Create train/eval split
    split_dataset = dataset.train_test_split(test_size=0.02, seed=42)
    train_dataset = split_dataset["train"]
    eval_dataset = split_dataset["test"]

    print(f"Train samples: {len(train_dataset):,}")
    print(f"Eval samples: {len(eval_dataset):,}")

    # -------------------------------------------------------------------------
    # Tokenize Dataset
    # -------------------------------------------------------------------------
    print(f"\nTokenizing dataset with max_length={MAX_SEQ_LENGTH}...")
    print("This may take a while for large datasets...")

    train_dataset = train_dataset.map(
        lambda x: tokenize_conversation(x, tokenizer, MAX_SEQ_LENGTH),
        remove_columns=["messages"],
        num_proc=4,
        desc="Tokenizing train",
    )

    eval_dataset = eval_dataset.map(
        lambda x: tokenize_conversation(x, tokenizer, MAX_SEQ_LENGTH),
        remove_columns=["messages"],
        num_proc=4,
        desc="Tokenizing eval",
    )

    print(f"Tokenization complete!")
    print(f"Train dataset columns: {train_dataset.column_names}")
    print(f"Sample input_ids length: {len(train_dataset[0]['input_ids'])}")

    # -------------------------------------------------------------------------
    # Upload to Hub
    # -------------------------------------------------------------------------
    print(f"\nUploading TOKENIZED dataset to Hub: {TOKENIZED_DATASET_REPO}")

    # Create repo
    api = HfApi()
    try:
        create_repo(
            TOKENIZED_DATASET_REPO,
            repo_type="dataset",
            private=TOKENIZED_DATASET_PRIVATE,
            exist_ok=True
        )
        print(f" Created/verified repo (private={TOKENIZED_DATASET_PRIVATE})")

        if TOKENIZED_DATASET_PRIVATE:
            try:
                api.update_repo_visibility(
                    TOKENIZED_DATASET_REPO,
                    repo_type="dataset",
                    private=True
                )
            except Exception:
                pass
    except Exception as e:
        print(f" Repo note: {e}")

    # Reset format for serialization
    train_dataset.reset_format()
    eval_dataset.reset_format()

    # Verify data
    print(f" Verifying tokenized data...")
    print(f" Train columns: {train_dataset.column_names}")
    print(f" Sample input_ids type: {type(train_dataset[0]['input_ids'])}")
    print(f" Sample input_ids length: {len(train_dataset[0]['input_ids'])}")
    print(f" First 10 tokens: {train_dataset[0]['input_ids'][:10]}")

    # Push to Hub
    print(f" Pushing train split ({len(train_dataset):,} examples)...")
    train_dataset.push_to_hub(
        TOKENIZED_DATASET_REPO,
        split="train",
    )

    print(f" Pushing test split ({len(eval_dataset):,} examples)...")
    eval_dataset.push_to_hub(
        TOKENIZED_DATASET_REPO,
        split="test",
    )

    print(f"\n" + "=" * 60)
    print(f"SUCCESS! Tokenized dataset saved to:")
    print(f" https://huggingface.co/datasets/{TOKENIZED_DATASET_REPO}")
    print(f"=" * 60)

    # Verify upload
    print("\nVerifying upload...")
    try:
        from datasets import load_dataset as verify_load
        verify_ds = verify_load(TOKENIZED_DATASET_REPO, split="train", streaming=True)
        sample = next(iter(verify_ds))
        if "input_ids" in sample:
            print(f" VERIFIED: Dataset contains input_ids with {len(sample['input_ids'])} tokens")
        else:
            print(f" WARNING: input_ids not found in columns: {list(sample.keys())}")
    except Exception as ve:
        print(f" Could not verify: {ve}")


if __name__ == "__main__":
    main()
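
Once uploaded, the tokenized splits can be loaded back and batched directly. A minimal reuse sketch, assuming read access to the private repo; the collator choice here is an illustration, not part of the script:

from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForSeq2Seq

tokenizer = AutoTokenizer.from_pretrained("Tesslate/Synthia-S1-27b", trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Private repo: requires a logged-in token with read access.
ds = load_dataset("Codyfederer/synthia-tool-calling-tokenized")

# DataCollatorForSeq2Seq pads input_ids/attention_mask and pads labels with -100,
# matching the input_ids/attention_mask/labels columns produced by this script.
collator = DataCollatorForSeq2Seq(tokenizer, padding=True)
batch = collator([ds["train"][i] for i in range(2)])
print(batch["input_ids"].shape, batch["labels"].shape)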