domci committed
Commit da28189
1 Parent(s): 723e9d1

Update README.md

Files changed (1): README.md (+123, -1)

README.md (updated):
datasets:
  - unicamp-dl/mmarco
language:
  - de
---

# ColBERTv2-mmarco-de-0.1

This is a German ColBERT implementation based on [colbert-ir/colbertv2.0](https://huggingface.co/colbert-ir/colbertv2.0).

- Base Model: [dbmdz/bert-base-german-cased](https://huggingface.co/dbmdz/bert-base-german-cased)
- Training Data: [unicamp-dl/mmarco](https://huggingface.co/datasets/unicamp-dl/mmarco), a random sample of 10M triples
- Framework used for training: [RAGatouille](https://github.com/bclavie/RAGatouille). Thanks a ton, [@bclavie](https://huggingface.co/bclavie)!

As I'm limited on GPU resources, training did not run all the way through: "only" 10 checkpoints were trained.
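
# Usage

For retrieval, the checkpoint can be loaded directly with RAGatouille. A minimal sketch (the Hub id `domci/ColBERTv2-mmarco-de-0.1`, the index name, and the documents below are assumptions for illustration, not taken from this card):

```python
from ragatouille import RAGPretrainedModel

# Load the checkpoint (repo id assumed from this model card).
rag = RAGPretrainedModel.from_pretrained("domci/ColBERTv2-mmarco-de-0.1")

# Index a few German documents (toy collection, just for illustration).
docs = [
    "Die Zugspitze ist mit 2962 Metern der höchste Berg Deutschlands.",
    "Der Rhein ist einer der längsten Flüsse Europas.",
]
rag.index(collection=docs, index_name="demo-de")

# Query the index; returns a ranked list of dicts with content and score.
results = rag.search(query="Wie hoch ist die Zugspitze?", k=2)
print(results)
```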

# Code

My code is probably a mess, but YOLO!

## Data prep
```python
from datasets import load_dataset
from ragatouille import RAGTrainer
from tqdm import tqdm
import pickle
from concurrent.futures import ThreadPoolExecutor
from tqdm.notebook import tqdm
import concurrent

# -1 means "use the full dataset" (see int_to_string below).
SAMPLE_SIZE = -1


def int_to_string(number):
    if number < 0:
        return "full"
    elif number < 1000:
        return str(number)
    elif number < 1000000:
        return f"{number // 1000}K"
    elif number >= 1000000:
        return f"{number // 1000000}M"


def process_chunk(chunk):
    # Turn a column-wise chunk into [query, positive, negative] triples.
    return [list(item) for item in zip(chunk["query"], chunk["positive"], chunk["negative"])]


def chunked_iterable(iterable, chunk_size):
    """Yield successive chunks from iterable."""
    for i in range(0, len(iterable), chunk_size):
        yield iterable[i:i + chunk_size]


def process_dataset_concurrently(dataset, chunksize=1000):
    with ThreadPoolExecutor() as executor:
        # Wrap the dataset with tqdm for real-time updates
        wrapped_dataset = tqdm(chunked_iterable(dataset, chunksize), total=(len(dataset) + chunksize - 1) // chunksize)
        # Submit each chunk to the executor
        futures = [executor.submit(process_chunk, chunk) for chunk in wrapped_dataset]
        results = []
        for future in concurrent.futures.as_completed(futures):
            results.extend(future.result())
        return results


dataset = load_dataset('unicamp-dl/mmarco', 'german', trust_remote_code=True)

# Shuffle the dataset and seed for reproducibility if needed
shuffled_dataset = dataset['train'].shuffle(seed=42)

if SAMPLE_SIZE > 0:
    sampled_dataset = shuffled_dataset.select(range(SAMPLE_SIZE))
else:
    sampled_dataset = shuffled_dataset

triplets = process_dataset_concurrently(sampled_dataset, chunksize=10000)
trainer = RAGTrainer(model_name=f"ColBERT-mmacro-de-{int_to_string(SAMPLE_SIZE)}", pretrained_model_name="dbmdz/bert-base-german-cased", language_code="de")
trainer.prepare_training_data(raw_data=triplets, mine_hard_negatives=False)
```
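
For reference, the `raw_data` passed to `prepare_training_data` is just a list of `[query, positive_passage, negative_passage]` triples, which is exactly what `process_chunk` emits. Something like this (made-up strings, not actual mMARCO rows):

```python
raw_triplets = [
    [
        "wie hoch ist die zugspitze",                                         # query
        "Die Zugspitze ist mit 2962 Metern der höchste Berg Deutschlands.",   # positive passage
        "Der Brocken ist der höchste Berg im Harz.",                          # negative passage
    ],
]
trainer.prepare_training_data(raw_data=raw_triplets, mine_hard_negatives=False)
```

`prepare_training_data` writes the processed triples to a local data directory; as far as I can tell, the Kaggle path assigned to `trainer.data_dir` in the training script below is simply that output, uploaded as a Kaggle dataset.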


## Training

```python
from datasets import load_dataset
import os
from ragatouille import RAGTrainer
from tqdm import tqdm
import pickle
from concurrent.futures import ThreadPoolExecutor
from tqdm.notebook import tqdm
import concurrent
from pathlib import Path


def int_to_string(number):
    if number < 1000:
        return str(number)
    elif number < 1000000:
        return f"{number // 1000}K"
    elif number >= 1000000:
        return f"{number // 1000000}M"


SAMPLE_SIZE = 1000000

trainer = RAGTrainer(model_name=f"ColBERT-mmacro-de-{int_to_string(SAMPLE_SIZE)}", pretrained_model_name="dbmdz/bert-base-german-cased", language_code="de")

# Point the trainer at the pre-processed training data (prepared above, mounted as a Kaggle dataset).
trainer.data_dir = Path("/kaggle/input/mmarco-de-10m")

trainer.train(
    batch_size=32,
    nbits=4,                # How many bits the trained model will use when compressing indexes
    maxsteps=500000,        # Maximum steps (hard stop)
    use_ib_negatives=True,  # Use in-batch negatives to calculate loss
    dim=128,                # How many dimensions per embedding. 128 is the default and works well.
    learning_rate=5e-6,     # Small values (3e-6 to 3e-5) work best if the base model is BERT-like; 5e-6 is often the sweet spot
    doc_maxlen=256,         # Maximum document length. Because of how ColBERT works, smaller chunks (128-256) work very well.
    use_relu=False,         # Disable ReLU -- doesn't improve performance
    warmup_steps="auto",    # Defaults to 10% of total steps
)
```
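
To sanity-check one of the intermediate checkpoints, RAGatouille can also load a local path; a rough sketch (the checkpoint path below is a placeholder, not the trainer's actual output location):

```python
from ragatouille import RAGPretrainedModel

# Placeholder path -- point this at whichever checkpoint directory trainer.train() wrote.
ckpt = RAGPretrainedModel.from_pretrained("path/to/local/checkpoint")

# Rerank a handful of passages in memory; no index needed for a quick check.
passages = [
    "Die Zugspitze ist mit 2962 Metern der höchste Berg Deutschlands.",
    "Der Rhein ist einer der längsten Flüsse Europas.",
]
print(ckpt.rerank(query="Wie hoch ist die Zugspitze?", documents=passages, k=2))
```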