---
datasets:
- unicamp-dl/mmarco
language:
- de
---

# ColBERTv2-mmarco-de-0.1

This is a German ColBERT implementation, based on [colbert-ir/colbertv2.0](https://huggingface.co/colbert-ir/colbertv2.0).

- Base model: [dbmdz/bert-base-german-cased](https://huggingface.co/dbmdz/bert-base-german-cased)
- Training data: [unicamp-dl/mmarco](https://huggingface.co/datasets/unicamp-dl/mmarco), a random sample of 10 million triplets
- Training framework: [RAGatouille](https://github.com/bclavie/RAGatouille) -- thanks a ton, [@bclavie](https://huggingface.co/bclavie)!

As I'm limited on GPU, training did not run all the way through: "only" 10 checkpoints were trained.
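
To query the model, RAGatouille can load a ColBERT checkpoint directly. A minimal usage sketch -- the model id below is a placeholder, so point `from_pretrained` at this repo's Hub id or a local checkpoint path; the documents and query are only illustrative:

```python
from ragatouille import RAGPretrainedModel

# Placeholder id -- replace with this repo's Hub id or a local checkpoint path.
RAG = RAGPretrainedModel.from_pretrained("ColBERTv2-mmarco-de-0.1")

# Build a small index over a few German documents...
RAG.index(
    index_name="demo-de",
    collection=[
        "Berlin ist die Hauptstadt von Deutschland.",
        "Die Donau ist der zweitlängste Fluss Europas.",
    ],
)

# ...and retrieve the top matches for a German query.
results = RAG.search(query="Was ist die Hauptstadt von Deutschland?", k=2)
print(results)
```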

# Code

My code is probably a mess, but YOLO!

## Data prep

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

from datasets import load_dataset
from ragatouille import RAGTrainer
from tqdm.auto import tqdm

SAMPLE_SIZE = -1  # -1 means: use the full dataset


def int_to_string(number):
    """Human-readable sample-size label for the model name."""
    if number < 0:
        return "full"
    elif number < 1000:
        return str(number)
    elif number < 1000000:
        return f"{number // 1000}K"
    else:
        return f"{number // 1000000}M"


def process_chunk(chunk):
    """Turn a batch of rows into [query, positive, negative] triplets."""
    return [list(item) for item in zip(chunk["query"], chunk["positive"], chunk["negative"])]


def chunked_iterable(iterable, chunk_size):
    """Yield successive chunks from iterable."""
    for i in range(0, len(iterable), chunk_size):
        yield iterable[i:i + chunk_size]


def process_dataset_concurrently(dataset, chunksize=1000):
    with ThreadPoolExecutor() as executor:
        # Wrap the chunk iterator with tqdm for real-time progress updates.
        wrapped_dataset = tqdm(
            chunked_iterable(dataset, chunksize),
            total=(len(dataset) + chunksize - 1) // chunksize,
        )
        # Submit each chunk to the executor.
        futures = [executor.submit(process_chunk, chunk) for chunk in wrapped_dataset]
        results = []
        for future in as_completed(futures):
            results.extend(future.result())
        return results


dataset = load_dataset("unicamp-dl/mmarco", "german", trust_remote_code=True)

# Shuffle with a fixed seed for reproducibility.
shuffled_dataset = dataset["train"].shuffle(seed=42)

if SAMPLE_SIZE > 0:
    sampled_dataset = shuffled_dataset.select(range(SAMPLE_SIZE))
else:
    sampled_dataset = shuffled_dataset

triplets = process_dataset_concurrently(sampled_dataset, chunksize=10000)

trainer = RAGTrainer(
    model_name=f"ColBERT-mmarco-de-{int_to_string(SAMPLE_SIZE)}",
    pretrained_model_name="dbmdz/bert-base-german-cased",
    language_code="de",
)
trainer.prepare_training_data(raw_data=triplets, mine_hard_negatives=False)
```
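
Note: `prepare_training_data` writes the processed triplets to disk (RAGatouille's default output path is `./data/`). The training script below simply points `trainer.data_dir` at that output -- in my case, re-uploaded as a Kaggle input dataset.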

## Training

```python
from pathlib import Path

from ragatouille import RAGTrainer


def int_to_string(number):
    """Human-readable sample-size label for the model name."""
    if number < 1000:
        return str(number)
    elif number < 1000000:
        return f"{number // 1000}K"
    else:
        return f"{number // 1000000}M"


SAMPLE_SIZE = 1000000

trainer = RAGTrainer(
    model_name=f"ColBERT-mmarco-de-{int_to_string(SAMPLE_SIZE)}",
    pretrained_model_name="dbmdz/bert-base-german-cased",
    language_code="de",
)

# Point the trainer at the triplets prepared above (here: a Kaggle input dataset).
trainer.data_dir = Path("/kaggle/input/mmarco-de-10m")

trainer.train(
    batch_size=32,
    nbits=4,                # How many bits the trained model will use when compressing indexes
    maxsteps=500000,        # Hard stop after this many steps
    use_ib_negatives=True,  # Use in-batch negatives to calculate the loss
    dim=128,                # Embedding dimension; 128 is the default and works well
    learning_rate=5e-6,     # Small values (3e-6 to 3e-5) work best for BERT-like base models; 5e-6 is often the sweet spot
    doc_maxlen=256,         # Maximum document length; ColBERT works best with smaller chunks (128-256)
    use_relu=False,         # Disable ReLU -- doesn't improve performance
    warmup_steps="auto",    # Defaults to 10% of the training steps
)
```
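
Unless you change the defaults, RAGatouille saves the trained checkpoints under a local `.ragatouille/` directory; pass the checkpoint path printed at the end of training to `RAGPretrainedModel.from_pretrained` (see the usage sketch above) to index and search with the fine-tuned model.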