Specific-Cognito committed on
Commit
3d8d9e4
·
verified ·
1 Parent(s): 9dde6d0

Create prepare_embeddings_data.py

Browse files
Files changed (1) hide show
  1. prepare_embeddings_data.py +631 -0
prepare_embeddings_data.py ADDED
@@ -0,0 +1,631 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Helion-V1-Embeddings Training Data Generator
3
+ Generate sentence pairs for training embeddings model
4
+ Optimized for semantic similarity and retrieval tasks
5
+ """
6
+
7
import json
import logging
import random
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union
13
+
14
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class EmbeddingsDataGenerator:
    """Generate sentence-pair training data for an embeddings model.

    Each example is a dict with the keys ``sentence1``, ``sentence2`` and
    ``score`` (a similarity label between 0 and 1). The hand-written pairs
    are shuffled, split 90/10 into train/validation sets, turned into
    (anchor, positive, negative) triplets and written to ``output_dir``
    together with a statistics file.
    """

    # Output formats understood by save_data().
    SUPPORTED_FORMATS = ("json", "jsonl")
    # Pairs scoring at or above this value supply anchor/positive sentences.
    HIGH_SIMILARITY_THRESHOLD = 0.80
    # Pairs scoring at or below this value supply negative sentences.
    LOW_SIMILARITY_THRESHOLD = 0.30

    def __init__(self, output_dir: str = "./embeddings_training_data"):
        """Remember the output directory and create it if it is missing.

        Args:
            output_dir: Directory that receives all generated files.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _build_pairs(rows: List[Tuple[str, str, float]]) -> List[Dict]:
        """Expand ``(sentence1, sentence2, score)`` tuples into pair dicts."""
        return [
            {"sentence1": first, "sentence2": second, "score": score}
            for first, second, score in rows
        ]

    @classmethod
    def _check_format(cls, format: str) -> None:
        """Raise ``ValueError`` unless *format* is a supported output format."""
        if format not in cls.SUPPORTED_FORMATS:
            raise ValueError(
                f"Unsupported format {format!r}; "
                f"expected one of {cls.SUPPORTED_FORMATS}"
            )

    def generate_paraphrase_pairs(self) -> List[Dict]:
        """
        Generate paraphrase pairs (high similarity).
        Score: 0.85-1.0
        """
        pairs = self._build_pairs([
            # Technical questions
            ("How do I install Python on Windows?",
             "What's the process to set up Python on a Windows computer?", 0.95),
            ("What is machine learning?",
             "Can you explain machine learning to me?", 0.92),
            ("How to fix a bug in my code?",
             "What's the best way to debug my program?", 0.88),
            ("Reset password instructions", "How do I reset my password?", 0.93),
            ("Database connection error", "Can't connect to the database", 0.90),

            # General knowledge
            ("What is the capital of France?",
             "Tell me the capital city of France", 0.96),
            ("Best restaurants in New York", "Where to eat in New York City", 0.89),
            ("Weather forecast for tomorrow",
             "What will the weather be like tomorrow?", 0.91),
            ("How to learn a new language",
             "Tips for learning foreign languages", 0.87),
            ("Symptoms of the flu", "What are flu symptoms?", 0.94),

            # Product/service queries
            ("How to cancel my subscription",
             "Steps to unsubscribe from the service", 0.90),
            ("Return policy for products", "How do I return an item?", 0.86),
            ("Customer support contact", "How to reach customer service", 0.92),
            ("Shipping tracking information", "Where is my order?", 0.85),
            ("Update payment method", "Change my credit card information", 0.88),
        ])

        logger.info("Generated %d paraphrase pairs", len(pairs))
        return pairs

    def generate_similar_pairs(self) -> List[Dict]:
        """
        Generate semantically similar pairs (medium-high similarity).
        Score: 0.60-0.85
        """
        pairs = self._build_pairs([
            # Related concepts
            ("Machine learning algorithms", "Neural network architectures", 0.75),
            ("Python programming language", "JavaScript coding tutorial", 0.68),
            ("Data science career path", "Becoming a data analyst", 0.72),
            ("Cloud computing services", "AWS infrastructure guide", 0.70),
            ("Web development frameworks", "React and Vue.js comparison", 0.74),

            # Related questions
            ("How to lose weight?", "Healthy eating habits", 0.65),
            ("Best laptops for programming",
             "Computer hardware for developers", 0.71),
            ("Learning guitar for beginners", "Music theory basics", 0.62),
            ("Travel tips for Europe", "Budget travel guide", 0.67),
            ("Home workout routines", "Fitness exercises without equipment", 0.73),

            # Professional context
            ("Project management best practices", "Agile methodology guide", 0.69),
            ("Resume writing tips", "Job interview preparation", 0.64),
            ("Team collaboration tools", "Remote work software solutions", 0.72),
            ("Time management techniques",
             "Productivity improvement strategies", 0.76),
            ("Business communication skills",
             "Professional email etiquette", 0.66),
        ])

        logger.info("Generated %d similar pairs", len(pairs))
        return pairs

    def generate_dissimilar_pairs(self) -> List[Dict]:
        """
        Generate unrelated pairs (low similarity).
        Score: 0.0-0.30
        """
        pairs = self._build_pairs([
            # Completely unrelated
            ("How to bake chocolate cake",
             "Installing Linux operating system", 0.05),
            ("Football match results", "Quantum physics equations", 0.02),
            ("Dog training tips", "Stock market analysis", 0.08),
            ("Car repair manual", "Ancient Roman history", 0.04),
            ("Gardening for beginners",
             "Cryptocurrency trading strategies", 0.06),

            # Different domains
            ("Piano lessons online", "Chemical engineering degree", 0.10),
            ("Knitting patterns", "Cybersecurity threats", 0.03),
            ("Mediterranean diet recipes", "Smartphone app development", 0.07),
            ("Yoga poses for flexibility", "Legal contract templates", 0.05),
            ("Movie reviews 2024", "Database optimization techniques", 0.09),

            # Topic mismatch
            ("Wedding planning checklist", "Machine learning deployment", 0.04),
            ("Child development stages", "Network security protocols", 0.06),
            ("Photography lighting techniques", "Tax filing requirements", 0.08),
            ("Fashion trends 2024", "Docker container orchestration", 0.02),
            ("Scuba diving certification",
             "Financial portfolio management", 0.07),
        ])

        logger.info("Generated %d dissimilar pairs", len(pairs))
        return pairs

    def generate_question_answer_pairs(self) -> List[Dict]:
        """
        Generate question-answer pairs for retrieval training.
        Score: 0.80-0.95
        """
        pairs = self._build_pairs([
            ("What is Python?",
             "Python is a high-level programming language known for its simplicity and versatility.",
             0.88),
            ("How does HTTP work?",
             "HTTP is a protocol that enables communication between web browsers and servers.",
             0.85),
            ("What is artificial intelligence?",
             "AI is the simulation of human intelligence by machines and computer systems.",
             0.90),
            ("Define cloud computing",
             "Cloud computing delivers computing services over the internet including storage and processing.",
             0.87),
            ("What is a database?",
             "A database is an organized collection of structured information stored electronically.",
             0.89),
            ("Explain REST API",
             "REST API is an architectural style for building web services using HTTP requests.",
             0.84),
            ("What is version control?",
             "Version control is a system that tracks changes to files over time.",
             0.86),
            ("Define responsive design",
             "Responsive design ensures websites work well on all devices and screen sizes.",
             0.88),
            ("What is encryption?",
             "Encryption is the process of encoding information to prevent unauthorized access.",
             0.91),
            ("Explain agile methodology",
             "Agile is an iterative approach to project management focused on flexibility.",
             0.83),
        ])

        logger.info("Generated %d question-answer pairs", len(pairs))
        return pairs

    def generate_domain_specific_pairs(self) -> List[Dict]:
        """
        Generate domain-specific sentence pairs.
        Score: Various
        """
        pairs = self._build_pairs([
            # Programming
            ("Python list comprehension",
             "Creating lists in Python efficiently", 0.86),
            ("Git merge conflicts", "Resolving version control conflicts", 0.84),
            ("React component lifecycle", "Understanding React hooks", 0.72),

            # Healthcare
            ("Blood pressure medication", "Treating hypertension", 0.78),
            ("Physical therapy exercises", "Rehabilitation program", 0.80),

            # Finance
            ("Investment portfolio diversification",
             "Managing financial risk", 0.75),
            ("Mortgage interest rates", "Home loan options", 0.82),

            # Education
            ("Online course platforms", "E-learning systems", 0.88),
            ("Study techniques for exams", "Test preparation strategies", 0.85),

            # E-commerce
            ("Product recommendation system",
             "Personalized shopping suggestions", 0.83),
            ("Shopping cart abandonment", "Incomplete purchase behavior", 0.86),
        ])

        logger.info("Generated %d domain-specific pairs", len(pairs))
        return pairs

    def format_for_training(self, pairs: List[Dict]) -> List[Dict]:
        """
        Format sentence pairs for training.

        Only the ``sentence1``, ``sentence2`` and ``score`` keys are kept;
        any extra keys on the input dicts are dropped.

        Args:
            pairs: List of sentence pair dictionaries

        Returns:
            Formatted training examples (new dicts, input is not mutated)

        Raises:
            KeyError: If a pair lacks one of the three required keys.
        """
        return [
            {
                "sentence1": pair["sentence1"],
                "sentence2": pair["sentence2"],
                "score": pair["score"],
            }
            for pair in pairs
        ]

    def create_contrastive_examples(
        self,
        pairs: List[Dict],
        max_triplets: int = 20,
        rng: Optional[random.Random] = None,
    ) -> List[Dict]:
        """
        Create contrastive examples (anchor, positive, negative).

        The first ``max_triplets`` pairs scoring at least
        ``HIGH_SIMILARITY_THRESHOLD`` provide the anchor (``sentence1``) and
        positive (``sentence2``). Each is combined with the ``sentence2`` of
        a randomly chosen pair scoring at most ``LOW_SIMILARITY_THRESHOLD``.

        Args:
            pairs: Sentence pairs with scores
            max_triplets: Upper bound on the number of triplets produced
            rng: Random source for picking negatives; defaults to the
                module-level ``random`` generator

        Returns:
            Triplet examples (empty when there are no high- or
            low-similarity pairs to draw from)
        """
        chooser = rng if rng is not None else random

        high_sim = [
            p for p in pairs if p["score"] >= self.HIGH_SIMILARITY_THRESHOLD
        ]
        low_sim = [
            p for p in pairs if p["score"] <= self.LOW_SIMILARITY_THRESHOLD
        ]

        contrastive = []
        # Without any low-similarity pair there is nothing to use as a negative.
        if low_sim:
            for positive_pair in high_sim[:max_triplets]:
                negative_pair = chooser.choice(low_sim)
                contrastive.append({
                    "anchor": positive_pair["sentence1"],
                    "positive": positive_pair["sentence2"],
                    "negative": negative_pair["sentence2"],
                })

        logger.info("Created %d contrastive examples", len(contrastive))
        return contrastive

    def save_data(
        self,
        data: Union[List[Dict], Dict],
        filename: str,
        format: str = "json",
    ):
        """Save training data to file.

        Args:
            data: A list of examples, or a single dict (e.g. statistics).
            filename: File name, resolved relative to ``output_dir``.
            format: ``"json"`` writes one indented JSON document;
                ``"jsonl"`` writes one JSON object per line.

        Raises:
            ValueError: If ``format`` is not supported. Nothing is written
                in that case.
        """
        # Validate first so an unknown format can never be reported as saved.
        self._check_format(format)
        filepath = self.output_dir / filename

        with open(filepath, 'w', encoding='utf-8') as f:
            if format == "json":
                json.dump(data, f, indent=2, ensure_ascii=False)
            else:
                # A lone dict is one record; iterating it would emit only keys.
                records = [data] if isinstance(data, dict) else data
                f.writelines(
                    json.dumps(item, ensure_ascii=False) + '\n'
                    for item in records
                )

        if isinstance(data, dict):
            logger.info("Saved mapping with %d keys to %s", len(data), filepath)
        else:
            logger.info("Saved %d examples to %s", len(data), filepath)

    def generate_full_dataset(
        self,
        format: str = "json",
        seed: Optional[int] = None,
    ) -> str:
        """
        Generate complete embeddings training dataset.

        Writes ``train_pairs.<format>``, ``validation_pairs.<format>``,
        ``contrastive_triplets.<format>`` and
        ``embeddings_dataset_stats.json`` into ``output_dir``.

        Args:
            format: Output format ('json' or 'jsonl')
            seed: Optional seed that makes the shuffle, the train/validation
                split and the negative sampling reproducible. When omitted,
                the module-level ``random`` generator is used.

        Returns:
            Output directory path

        Raises:
            ValueError: If ``format`` is not supported.
        """
        # Fail before doing any work or writing any file.
        self._check_format(format)
        logger.info("Generating embeddings training dataset...")

        # A private generator keeps a requested seed from touching global state.
        rng = random.Random(seed) if seed is not None else random

        # Collect all pairs
        paraphrase_pairs = self.generate_paraphrase_pairs()
        similar_pairs = self.generate_similar_pairs()
        dissimilar_pairs = self.generate_dissimilar_pairs()
        qa_pairs = self.generate_question_answer_pairs()
        domain_pairs = self.generate_domain_specific_pairs()

        all_pairs = [
            *paraphrase_pairs,
            *similar_pairs,
            *dissimilar_pairs,
            *qa_pairs,
            *domain_pairs,
        ]

        # Shuffle
        rng.shuffle(all_pairs)

        # Split train/validation
        split_idx = int(len(all_pairs) * 0.9)
        train_pairs = all_pairs[:split_idx]
        val_pairs = all_pairs[split_idx:]

        logger.info("Train: %d pairs", len(train_pairs))
        logger.info("Validation: %d pairs", len(val_pairs))

        # Format data
        train_data = self.format_for_training(train_pairs)
        val_data = self.format_for_training(val_pairs)

        # Save sentence pair format
        self.save_data(train_data, f"train_pairs.{format}", format)
        self.save_data(val_data, f"validation_pairs.{format}", format)

        # Create contrastive examples
        # NOTE(review): triplets draw from all pairs, so sentences from the
        # validation split can also appear here - confirm this is intended.
        contrastive_data = self.create_contrastive_examples(all_pairs, rng=rng)
        self.save_data(contrastive_data, f"contrastive_triplets.{format}", format)

        # Generate statistics
        scores = [p["score"] for p in all_pairs]
        stats = {
            "total_pairs": len(all_pairs),
            "train_size": len(train_pairs),
            "validation_size": len(val_pairs),
            "contrastive_triplets": len(contrastive_data),
            "paraphrase_pairs": len(paraphrase_pairs),
            "similar_pairs": len(similar_pairs),
            "dissimilar_pairs": len(dissimilar_pairs),
            "qa_pairs": len(qa_pairs),
            "domain_pairs": len(domain_pairs),
            "score_distribution": {
                "high (0.8-1.0)": sum(1 for s in scores if s >= 0.8),
                "medium (0.5-0.8)": sum(1 for s in scores if 0.5 <= s < 0.8),
                "low (0.0-0.5)": sum(1 for s in scores if s < 0.5)
            },
            "generated_at": datetime.now().isoformat(),
            "format": format,
            "seed": seed
        }

        self.save_data(stats, "embeddings_dataset_stats.json", "json")

        logger.info("="*60)
        logger.info("✅ Embeddings dataset generation complete!")
        logger.info("Total pairs: %d", len(all_pairs))
        logger.info("Output directory: %s", self.output_dir)
        logger.info("="*60)

        return str(self.output_dir)
582
+
583
+
584
def main():
    """Command-line entry point: generate the dataset and print a summary.

    Parses ``--output-dir`` (default ``./embeddings_training_data``) and
    ``--format`` (``json`` or ``jsonl``, default ``json``), runs
    ``EmbeddingsDataGenerator.generate_full_dataset`` and prints where the
    files were written plus a suggested next command.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Generate training data for Helion-V1-Embeddings"
    )
    parser.add_argument(
        "--output-dir",
        default="./embeddings_training_data",
        help="Output directory for training data"
    )
    parser.add_argument(
        "--format",
        choices=["json", "jsonl"],
        default="json",
        help="Output format"
    )

    args = parser.parse_args()

    # Generate dataset
    generator = EmbeddingsDataGenerator(output_dir=args.output_dir)
    output_path = generator.generate_full_dataset(format=args.format)

    # The emoji and bullets below replace mis-decoded UTF-8 byte sequences.
    print("\n" + "="*60)
    print("🎯 Embeddings Training Data Ready!")
    print("="*60)
    print(f"📁 Location: {output_path}")
    print(f"📊 Format: {args.format}")
    print("\n📄 Files created:")
    print(f" • train_pairs.{args.format} - Training sentence pairs")
    print(f" • validation_pairs.{args.format} - Validation pairs")
    print(f" • contrastive_triplets.{args.format} - Triplet examples")
    print(" • embeddings_dataset_stats.json - Dataset statistics")
    print("\n💡 Training data includes:")
    print(" • Paraphrase pairs (high similarity)")
    print(" • Similar concept pairs (medium similarity)")
    print(" • Dissimilar pairs (low similarity)")
    print(" • Question-answer pairs")
    print(" • Domain-specific examples")
    print("\n🚀 Next step:")
    print(f" python train_embeddings.py --data-file {output_path}/train_pairs.{args.format}")
    print("="*60)