Cludoy committed on
Commit
6c56621
·
verified ·
1 Parent(s): 0a7ac4c

Add dataset_generator.py

Browse files
Files changed (1) hide show
  1. dataset_generator.py +512 -0
dataset_generator.py ADDED
@@ -0,0 +1,512 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Dataset Generation Pipeline for TinyBert-CNN Intent Classifier.
3
+ Generates (student_input, session_context, label) triples for 5-class classification.
4
+ """
5
+
6
+ import random
7
+ import pandas as pd
8
+ import os
9
+ import re
10
+
11
+ # ─────────────────────────────────────────────────────────────────────
12
+ # CONSTANTS & METADATA
13
+ # ─────────────────────────────────────────────────────────────────────
14
+
15
+ PYTHON_TOPICS = [
16
+ "Variables and Data Types",
17
+ "Strings and Formatting",
18
+ "Arithmetic Operators",
19
+ "Boolean Logic",
20
+ "If/Else Conditionals",
21
+ "For Loops",
22
+ "While Loops",
23
+ "Lists and Tuples",
24
+ "Dictionaries",
25
+ "Sets",
26
+ "Functions and Scope",
27
+ "Lambda Functions",
28
+ "Error Handling (Try/Except)",
29
+ "Classes and OOP",
30
+ "File Handling"
31
+ ]
32
+
33
+ LABEL_MAP = {
34
+ 'On-Topic Question': 0,
35
+ 'Off-Topic Question': 1,
36
+ 'Emotional-State': 2,
37
+ 'Pace-Related': 3,
38
+ 'Repeat/clarification': 4
39
+ }
40
+
41
+ EMOTIONS = ["neutral", "engaged", "focused", "frustrated", "confused", "bored", "tired", "anxious", "excited", "overwhelmed"]
42
+ PACES = ["normal", "fast", "slow", "rushed", "dragging", "moderate", "steady"]
43
+
44
+ # ─────────────────────────────────────────────────────────────────────
45
+ # CONTEXT GENERATION (Compact key-value format)
46
+ # ─────────────────────────────────────────────────────────────────────
47
+
48
def generate_session_context(current_topic_idx):
    """Build a compact, pipe-delimited session context for one sample.

    Args:
        current_topic_idx: Index into PYTHON_TOPICS of the topic being taught.

    Returns:
        A tuple ``(context, current_topic, prev_topics)``: the formatted
        ``key:value | key:value`` string, the current topic name, and the list
        of up to three topics that immediately precede it in PYTHON_TOPICS.
    """
    current_topic = PYTHON_TOPICS[current_topic_idx]

    # Look back over at most three topics; the very first topic has no history.
    prev_topics = []
    if current_topic_idx > 0:
        window = min(3, current_topic_idx)
        prev_topics = PYTHON_TOPICS[current_topic_idx - window:current_topic_idx]

    # Topic names are shortened by dropping any parenthesised suffix.
    base_names = [name.split("(")[0].strip() for name in prev_topics]

    # One random ability score (30-100) per previous topic, drawn in order.
    abilities = [
        f"{base.replace(' and ', '&')}:{random.randint(30, 100)}%"
        for base in base_names
    ]

    if prev_topics:
        ability_str = ",".join(abilities)
        prev_str = ",".join(base_names)
    else:
        ability_str = "N/A"
        prev_str = "None"

    emotion = random.choice(EMOTIONS)
    pace = random.choice(PACES)
    slide = random.randint(5, 60)

    fields = [
        f"topic:{current_topic}",
        f"prev:{prev_str}",
        f"ability:{ability_str}",
        f"emotion:{emotion}",
        f"pace:{pace}",
        # The slide window is the drawn slide plus its two neighbours.
        f"slides:{slide-1},{slide},{slide+1}",
    ]
    context = " | ".join(fields)
    return context, current_topic, prev_topics
80
+
81
+
82
+ # ─────────────────────────────────────────────────────────────────────
83
+ # EXPANDED TEMPLATE BANKS (40+ per class)
84
+ # ─────────────────────────────────────────────────────────────────────
85
+
86
+ ON_TOPIC_TEMPLATES = [
87
+ # Direct questions
88
+ "How do I use {topic} in my code?",
89
+ "Can you explain {topic} again?",
90
+ "What are the best practices for {topic}?",
91
+ "Can you show me an example of {topic}?",
92
+ "Why is {topic} giving me a syntax error?",
93
+ "Is there a different way to write {topic}?",
94
+ "I don't get the part about {topic}.",
95
+ "Can we do another exercise for {topic}?",
96
+ "What happens if I forget to close the bracket in {topic}?",
97
+ "How is {topic} different from the previous topic?",
98
+ # Conceptual questions
99
+ "Why do we need {topic}?",
100
+ "When should I use {topic} vs the other approach?",
101
+ "What's the point of {topic}?",
102
+ "Is {topic} used a lot in real projects?",
103
+ "Can you give me a real-world example of {topic}?",
104
+ "Does {topic} work the same way in other languages?",
105
+ # Problem-solving
106
+ "I'm stuck on this challenge about {topic}.",
107
+ "My code for {topic} isn't working, can you help?",
108
+ "I keep getting an error with {topic}.",
109
+ "Why does my {topic} code print the wrong output?",
110
+ "What am I doing wrong with {topic}?",
111
+ "Can you debug this {topic} example with me?",
112
+ # Clarification on current material
113
+ "What did you mean when you said {topic} works like that?",
114
+ "Can you go deeper into {topic}?",
115
+ "Is there more to know about {topic}?",
116
+ "How does {topic} connect to what we learned before?",
117
+ "What's the difference between the two approaches you showed for {topic}?",
118
+ "Can you break down {topic} step by step?",
119
+ # Practical application
120
+ "How would I use {topic} in a project?",
121
+ "Can I combine {topic} with what we learned earlier?",
122
+ "Is {topic} something I'll use every day?",
123
+ "Where does {topic} fit in a larger program?",
124
+ "Can you show me a more advanced use of {topic}?",
125
+ # Short/informal
126
+ "Tell me more about {topic}",
127
+ "What's {topic} again?",
128
+ "{topic} is confusing",
129
+ "Help me with {topic}",
130
+ "I need help understanding {topic}",
131
+ "So how does {topic} actually work?",
132
+ "Wait, explain {topic} one more time",
133
+ ]
134
+
135
+ # Context-aware on-topic templates (reference ability scores, prev topics)
136
+ ON_TOPIC_CONTEXT_TEMPLATES = [
137
+ "You said I scored low on {prev_topic}, does that affect how I should approach {topic}?",
138
+ "Since I did well on {prev_topic}, is {topic} going to be similar?",
139
+ "How does {prev_topic} relate to {topic}?",
140
+ "I understood {prev_topic} but {topic} feels completely different, why?",
141
+ "Can we review {prev_topic} briefly before diving deeper into {topic}?",
142
+ "My score on {prev_topic} was not great, will I need it for {topic}?",
143
+ ]
144
+
145
+ OFF_TOPIC_GENERAL = [
146
+ "What's the weather like today?",
147
+ "How do I cook pasta?",
148
+ "Who won the soccer match last night?",
149
+ "Can you recommend a good movie to watch?",
150
+ "What is the capital of France?",
151
+ "How much does a new car cost?",
152
+ "Do you like listening to music?",
153
+ "Tell me a joke.",
154
+ "I'm feeling hungry, should I order pizza?",
155
+ "What is your favorite color?",
156
+ "What time is it?",
157
+ "Do you know any good restaurants nearby?",
158
+ "Who is the president of the United States?",
159
+ "What's the best phone to buy right now?",
160
+ "Can you help me with my math homework?",
161
+ "How tall is the Eiffel Tower?",
162
+ "What should I eat for dinner?",
163
+ "Do you watch Netflix?",
164
+ "What's the meaning of life?",
165
+ "How do I fix my car?",
166
+ ]
167
+
168
+ OFF_TOPIC_FUTURE_TOPIC_TEMPLATES = [
169
+ "Are we going to learn about {topic} soon?",
170
+ "What is {topic} exactly?",
171
+ "I heard about {topic}, can you explain it to me?",
172
+ "How does {topic} work in Python?",
173
+ "Can we skip ahead to {topic}?",
174
+ "Is {topic} hard to learn?",
175
+ "I saw someone using {topic}, what does it do?",
176
+ "Do we need to know about {topic}?",
177
+ "When will we cover {topic}?",
178
+ "My friend told me {topic} is important, is that true?",
179
+ "Will {topic} be on the exam?",
180
+ "Can you give me a sneak peek of {topic}?",
181
+ "I already know a bit about {topic}, can we jump to it?",
182
+ "How long until we get to {topic}?",
183
+ "Is {topic} related to what we are doing now?",
184
+ ]
185
+
186
+ EMOTIONAL_TEMPLATES = [
187
+ # Frustration
188
+ "I am so frustrated right now.",
189
+ "This is making me really angry.",
190
+ "I can't take this anymore.",
191
+ "I feel like giving up.",
192
+ "Nothing makes sense to me.",
193
+ "I'm losing my patience.",
194
+ "Why is this so hard?",
195
+ "I feel stupid for not getting this.",
196
+ # Positive
197
+ "This is really starting to make sense!",
198
+ "I love coding, this is fun!",
199
+ "Wow, I finally understand it!",
200
+ "I am ready to tackle the next challenge!",
201
+ "This is getting exciting!",
202
+ "I feel so good about this now.",
203
+ "I'm having a great time learning this.",
204
+ "That was actually easier than I thought.",
205
+ # Confusion
206
+ "I feel completely stuck and confused.",
207
+ "I have no idea what's going on.",
208
+ "My brain is fried.",
209
+ "I'm lost.",
210
+ "I don't understand anything.",
211
+ "This is so confusing it hurts.",
212
+ # Boredom / tiredness
213
+ "This is getting boring.",
214
+ "I'm feeling super tired today.",
215
+ "My head hurts from all this information.",
216
+ "I feel like I'm not making any progress.",
217
+ "Can we do something more interesting?",
218
+ "I'm so sleepy right now.",
219
+ "This is not engaging at all.",
220
+ "My eyes are glazing over.",
221
+ # Anxiety
222
+ "I'm nervous about the upcoming test.",
223
+ "What if I fail?",
224
+ "I feel anxious about falling behind.",
225
+ "Everyone else seems to get it except me.",
226
+ "I'm stressed out.",
227
+ # Mixed / ambiguous (touches emotional + other intents)
228
+ "I'm confused, I feel so dumb right now.",
229
+ "I'm excited but also scared I'll mess up.",
230
+ "I'm frustrated because this used to make sense.",
231
+ "I feel overwhelmed by all this new stuff.",
232
+ "I just feel really down today.",
233
+ ]
234
+
235
+ PACE_TEMPLATES = [
236
+ # Slow down
237
+ "Can we slow down a bit?",
238
+ "You are going way too fast.",
239
+ "Wait, can you slow down the explanation?",
240
+ "I need more time to process this.",
241
+ "Can you wait a second before moving to the next slide?",
242
+ "Hold on, I'm still writing notes.",
243
+ "Please slow down, I can't keep up.",
244
+ "You're moving too quickly for me.",
245
+ "I need a moment to think about this.",
246
+ "Can we pause for a minute?",
247
+ "Don't rush through this please.",
248
+ "Slow down, I'm still on the last example.",
249
+ "Give me a sec, I'm still processing.",
250
+ # Speed up
251
+ "Let's move on to the next topic.",
252
+ "Can we skip this?",
253
+ "I think I got this, let's speed up.",
254
+ "Can we go through the next part faster?",
255
+ "Let's speed up the pace, I'm bored.",
256
+ "I already know this, can we move on?",
257
+ "This part is easy, let's go faster.",
258
+ "Skip ahead please.",
259
+ "Next topic please.",
260
+ "We're spending too long on this.",
261
+ "Can we pick up the pace?",
262
+ # Break / timing
263
+ "Can we take a break?",
264
+ "How much time do we have left?",
265
+ "When does this session end?",
266
+ "I need a 5 minute break.",
267
+ "Let's take a quick breather.",
268
+ # General pacing
269
+ "The pace feels about right.",
270
+ "Can you adjust the speed a bit?",
271
+ "I think the pacing is off.",
272
+ "Are we on schedule?",
273
+ "How many more slides do we have?",
274
+ ]
275
+
276
+ REPEAT_TEMPLATES = [
277
+ "Can you repeat that last part?",
278
+ "What did you say about the slide right before this one?",
279
+ "Could you clarify what you meant?",
280
+ "I didn't catch that, can you say it again?",
281
+ "Say that again?",
282
+ "Can you go back to the previous slide for a second?",
283
+ "I missed the first step, can you re-explain?",
284
+ "Can you repeat the rule for that?",
285
+ "Could you run through the explanation one more time?",
286
+ "Can you clarify the difference between the two examples?",
287
+ "Wait, what was that?",
288
+ "Huh? Can you repeat?",
289
+ "I didn't understand, please say it again.",
290
+ "Sorry, I zoned out. What did you just say?",
291
+ "Come again?",
292
+ "Can you show that example one more time?",
293
+ "Go back to that last point please.",
294
+ "I need you to repeat the definition.",
295
+ "What was the syntax you just showed?",
296
+ "Can you re-explain how that works?",
297
+ "I lost you there, can you start over on that point?",
298
+ "Please repeat the steps.",
299
+ "Sorry, can you go over that again from the beginning?",
300
+ "What was the output of that code again?",
301
+ "Can you re-run that example?",
302
+ "I missed it, one more time please.",
303
+ "I need to hear that explanation again.",
304
+ "Can you walk me through that once more?",
305
+ "Let me see that slide again.",
306
+ "I need a recap of what you just said.",
307
+ "Can you summarize what you just explained?",
308
+ "What were the key points of that last section?",
309
+ ]
310
+
311
+
312
+ # ─────────────────────────────────────────────────────────────────────
313
+ # AUGMENTATION STRATEGIES
314
+ # ─────────────────────────────────────────────────────────────────────
315
+
316
+ SYNONYM_MAP = {
317
+ "explain": ["describe", "clarify", "elaborate on", "break down", "walk me through"],
318
+ "show": ["demonstrate", "present", "display", "give me"],
319
+ "help": ["assist", "support", "aid"],
320
+ "use": ["utilize", "apply", "work with"],
321
+ "understand": ["get", "grasp", "comprehend", "follow"],
322
+ "repeat": ["say again", "go over again", "redo", "recap"],
323
+ "confused": ["lost", "puzzled", "unsure", "baffled"],
324
+ "stuck": ["blocked", "stalled", "unable to proceed"],
325
+ "slow down": ["take it easy", "go slower", "ease up"],
326
+ "speed up": ["go faster", "pick up the pace", "hurry up"],
327
+ "example": ["demo", "sample", "illustration", "instance"],
328
+ "error": ["bug", "mistake", "issue", "problem"],
329
+ "different": ["alternative", "another", "other"],
330
+ "code": ["program", "script", "snippet"],
331
+ }
332
+
333
+ FILLERS = ["umm", "so", "like", "hey", "well", "basically", "honestly", "actually", "ok so", "right"]
334
+
335
def augment_synonym(text, synonym_map=None, prob=0.35):
    """Replace the first whole-word occurrence of one mapped word with a synonym.

    Keys of the synonym map are tried in insertion order. The first key that
    occurs in ``text`` as a whole word (case-insensitive) and passes the
    ``prob`` coin flip is replaced once; no further keys are tried afterwards.

    Matching is anchored on word boundaries so that a key such as "use" does
    not rewrite the inside of longer words like "used", "because" or
    "confused".

    Args:
        text: The sentence to augment.
        synonym_map: Mapping of word/phrase -> list of replacement strings.
            Defaults to the module-level SYNONYM_MAP.
        prob: Probability of applying the replacement for a key found in
            ``text``.

    Returns:
        The augmented sentence, or ``text`` unchanged if nothing was replaced.
    """
    if synonym_map is None:
        synonym_map = SYNONYM_MAP

    for word, synonyms in synonym_map.items():
        if not synonyms:
            continue
        pattern = re.compile(r"\b" + re.escape(word) + r"\b", re.IGNORECASE)
        # The coin is only flipped for keys that are actually present.
        if pattern.search(text) and random.random() < prob:
            replacement = random.choice(synonyms)
            # A callable replacement keeps the synonym literal (no escape parsing).
            return pattern.sub(lambda _match: replacement, text, count=1)
    return text
343
+
344
def augment_case(text):
    """Randomly lower-case (30%) or upper-case (8%) the text, else keep it."""
    roll = random.random()
    if roll >= 0.38:
        return text
    return text.lower() if roll < 0.3 else text.upper()
352
+
353
def augment_punctuation(text):
    """Randomly rewrite the trailing punctuation of the text.

    One random draw picks the outcome: trailing "?", "!" and "." are replaced
    by a single "?" (25%), removed (15%), replaced by "!!" (8%), or the text
    is returned untouched (52%).
    """
    roll = random.random()
    if roll >= 0.48:
        return text

    bare = text.rstrip("?!.")
    if roll < 0.25:
        return bare + "?"
    if roll < 0.4:
        return bare
    return bare + "!!"
363
+
364
def augment_filler(text):
    """With 20% probability, prefix the text with a random entry of FILLERS."""
    if random.random() >= 0.2:
        return text
    return random.choice(FILLERS) + " " + text
369
+
370
def augment_typo(text, prob=0.08):
    """Inject character-level typos into the text.

    The whole text is left untouched roughly 65% of the time. Otherwise each
    position is visited once and, with probability ``prob``, an alphabetic
    character there is swapped with its right neighbour, deleted, or doubled.

    Args:
        text: The sentence to corrupt.
        prob: Per-character probability of applying a typo.

    Returns:
        The (possibly) corrupted sentence.
    """
    if random.random() > 0.35:
        return text

    buf = list(text)
    last = len(buf) - 1
    for pos, _ in enumerate(text):
        # Draw first, then test the character, so the RNG advances per position.
        hit = random.random() < prob
        if not (hit and buf[pos].isalpha()):
            continue

        op = random.choice(["swap", "delete", "duplicate"])
        if op == "delete":
            buf[pos] = ""
        elif op == "duplicate":
            buf[pos] = buf[pos] * 2
        elif pos < last:
            # "swap": a swap on the final character is a no-op.
            buf[pos], buf[pos + 1] = buf[pos + 1], buf[pos]
    return "".join(buf)
385
+
386
def augment_word_swap(text):
    """With 15% probability, swap one random pair of adjacent words.

    Texts of two words or fewer are always returned unchanged.
    """
    tokens = text.split()
    if len(tokens) > 2 and random.random() <= 0.15:
        left = random.randint(0, len(tokens) - 2)
        tokens[left:left + 2] = [tokens[left + 1], tokens[left]]
        return " ".join(tokens)
    return text
394
+
395
def augment_word_delete(text):
    """With 12% probability, drop one random interior word.

    The first and last words are never removed, and texts of three words or
    fewer are always returned unchanged.
    """
    tokens = text.split()
    if len(tokens) > 3 and random.random() <= 0.12:
        victim = random.randint(1, len(tokens) - 2)
        return " ".join(tokens[:victim] + tokens[victim + 1:])
    return text
403
+
404
def augment_text(text):
    """Run the text through one to three randomly chosen augmentation steps.

    The steps are drawn without replacement from the available augmenters and
    applied in the sampled order; the result is stripped of surrounding
    whitespace.
    """
    available = [
        augment_synonym,
        augment_case,
        augment_punctuation,
        augment_filler,
        augment_typo,
        augment_word_swap,
        augment_word_delete,
    ]
    how_many = random.randint(1, 3)
    for step in random.sample(available, k=how_many):
        text = step(text)
    return text.strip()
413
+
414
+
415
+ # ─────────────────────────────────────────────────────────────────────
416
+ # INTENT GENERATORS
417
+ # ─────────────────────────────────────────────────────────────────────
418
+
419
def get_on_topic_question(current_topic, prev_topics):
    """Return a question about the current topic.

    When previous topics exist there is a 20% chance of picking a template
    that also mentions one of them; otherwise a plain on-topic template is
    used.
    """
    use_context = bool(prev_topics) and random.random() < 0.2
    if not use_context:
        return random.choice(ON_TOPIC_TEMPLATES).replace("{topic}", current_topic)

    earlier = random.choice(prev_topics)
    question = random.choice(ON_TOPIC_CONTEXT_TEMPLATES)
    question = question.replace("{topic}", current_topic)
    return question.replace("{prev_topic}", earlier)
427
+
428
def get_off_topic_question(current_topic_idx):
    """Return an off-topic question.

    Half of the time (when any later topic exists) the question asks about a
    topic that comes after the current one in PYTHON_TOPICS; otherwise a
    general, non-course question is returned.
    """
    has_later = current_topic_idx < len(PYTHON_TOPICS) - 1
    if not (has_later and random.random() < 0.5):
        return random.choice(OFF_TOPIC_GENERAL)

    later_topic = random.choice(PYTHON_TOPICS[current_topic_idx + 1:])
    question = random.choice(OFF_TOPIC_FUTURE_TOPIC_TEMPLATES)
    return question.replace("{topic}", later_topic)
434
+
435
def get_emotional_state():
    """Return one randomly chosen utterance from EMOTIONAL_TEMPLATES."""
    utterance = random.choice(EMOTIONAL_TEMPLATES)
    return utterance
437
+
438
def get_pace_related():
    """Return one randomly chosen utterance from PACE_TEMPLATES."""
    utterance = random.choice(PACE_TEMPLATES)
    return utterance
440
+
441
def get_repeat_clarification():
    """Return one randomly chosen utterance from REPEAT_TEMPLATES."""
    utterance = random.choice(REPEAT_TEMPLATES)
    return utterance
443
+
444
+
445
+ # ─────────────────────────────────────────────────────────────────────
446
+ # PIPELINE GENERATION (3-way split: train/val/test)
447
+ # ─────────────────────────────────────────────────────────────────────
448
+
449
def _stratified_split(df, train_ratio, val_ratio):
    """Split *df* per label into shuffled train/val/test frames."""
    train_parts, val_parts, test_parts = [], [], []
    for label_id in sorted(df['label'].unique()):
        label_df = df[df['label'] == label_id].reset_index(drop=True)
        n = len(label_df)
        train_end = int(n * train_ratio)
        val_end = int(n * (train_ratio + val_ratio))
        train_parts.append(label_df.iloc[:train_end])
        val_parts.append(label_df.iloc[train_end:val_end])
        # The test split is whatever remains after train and val.
        test_parts.append(label_df.iloc[val_end:])

    return tuple(
        pd.concat(parts).sample(frac=1, random_state=42).reset_index(drop=True)
        for parts in (train_parts, val_parts, test_parts)
    )


def build_dataset(num_samples_per_class=2000, train_ratio=0.70, val_ratio=0.15,
                  test_ratio=0.15, seed=None, output_dir='data'):
    """Generate the synthetic intent dataset and write train/val/test CSVs.

    For every intent in LABEL_MAP, ``num_samples_per_class`` samples are
    generated (random topic, session context, templated and augmented student
    input). The rows are shuffled, split per label into train/val/test, and
    written to ``train.csv``, ``val.csv`` and ``test.csv`` in ``output_dir``.

    Args:
        num_samples_per_class: Number of samples to generate per intent (>= 1).
        train_ratio: Fraction of each class assigned to the training split.
        val_ratio: Fraction of each class assigned to the validation split.
        test_ratio: Expected fraction for the test split. The test split always
            receives the remainder ``1 - train_ratio - val_ratio``; a notice is
            printed when this argument disagrees with that remainder.
        seed: Optional seed for Python's ``random`` module so that the
            generated text is reproducible. ``None`` leaves the RNG untouched.
            The pandas shuffles always use a fixed ``random_state`` of 42.
        output_dir: Directory the CSV files are written to (created if needed).

    Raises:
        ValueError: If ``num_samples_per_class`` is below 1, a ratio is
            negative, ``train_ratio + val_ratio`` exceeds 1, or LABEL_MAP
            contains an intent without a generator.
    """
    # Validate up front so bad arguments fail before any work is done.
    if num_samples_per_class < 1:
        raise ValueError("num_samples_per_class must be at least 1")
    if train_ratio < 0 or val_ratio < 0 or test_ratio < 0:
        raise ValueError("split ratios must be non-negative")
    if train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError("train_ratio + val_ratio must not exceed 1")

    remainder = 1.0 - train_ratio - val_ratio
    if abs(remainder - test_ratio) > 1e-6:
        print(f"[!] test_ratio={test_ratio} does not match the remainder; "
              f"the test split receives {remainder:.4f} of each class.")

    if seed is not None:
        random.seed(seed)

    print(f"Starting Dataset Generation ({num_samples_per_class} per class)...")

    # One generator per intent; each takes (topic_idx, current_topic, prev_topics).
    generators = {
        'On-Topic Question': lambda idx, topic, prev: get_on_topic_question(topic, prev),
        'Off-Topic Question': lambda idx, topic, prev: get_off_topic_question(idx),
        'Emotional-State': lambda idx, topic, prev: get_emotional_state(),
        'Pace-Related': lambda idx, topic, prev: get_pace_related(),
        'Repeat/clarification': lambda idx, topic, prev: get_repeat_clarification(),
    }
    # Refuse to emit rows for an intent we cannot generate text for, rather
    # than silently labelling off-topic text with the new intent.
    unknown = [intent for intent in LABEL_MAP if intent not in generators]
    if unknown:
        raise ValueError(f"No generator defined for intents: {unknown}")

    dataset = []
    for intent, label_id in LABEL_MAP.items():
        make_input = generators[intent]
        for _ in range(num_samples_per_class):
            topic_idx = random.randint(0, len(PYTHON_TOPICS) - 1)
            context_str, current_topic, prev_topics = generate_session_context(topic_idx)

            student_input = make_input(topic_idx, current_topic, prev_topics)
            student_input = augment_text(student_input)

            dataset.append({
                'student_input': student_input,
                'session_context': context_str,
                'label': label_id,
                'intent_name': intent
            })

    df = pd.DataFrame(dataset)
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    # Stratified 3-way split
    train_df, val_df, test_df = _stratified_split(df, train_ratio, val_ratio)

    os.makedirs(output_dir, exist_ok=True)

    train_df.to_csv(os.path.join(output_dir, 'train.csv'), index=False)
    val_df.to_csv(os.path.join(output_dir, 'val.csv'), index=False)
    test_df.to_csv(os.path.join(output_dir, 'test.csv'), index=False)

    print("[+] Data Generation Complete!")
    print(f"Total: {len(df)} | Train: {len(train_df)} | Val: {len(val_df)} | Test: {len(test_df)}")
    print(f"Train distribution:\n{train_df['label'].value_counts().sort_index().to_string()}")
509
+
510
+
511
+ if __name__ == '__main__':
512
+ build_dataset(num_samples_per_class=2000)