LaughLM / scripts / build_shard.py
dignity045's picture
Duplicate from Dhiraj45/LaughLM
9639af0
raw
history blame contribute delete
630 Bytes
from LaughLM.data.dataset import DomainSampler
from LaughLM.data.tokenizer import LaughTokenizer
from LaughLM.data.shard_writer import BinaryShardWriter
def main(
    *,
    tokenizer_path: str = "tokenizer.json",
    output_path: str = "train_000.bin",
    shard_tokens: int = 5_000_000_000,
    sources=None,
) -> None:
    """Write one shard file from a weighted mix of dataset sources.

    Loads a ``LaughTokenizer`` from ``tokenizer_path``, builds a
    ``DomainSampler`` over ``sources``, and passes an iterator over that
    sampler to ``BinaryShardWriter.build_shard``.

    All parameters are keyword-only and default to the values this script
    originally hard-coded, so calling ``main()`` with no arguments is
    unchanged.

    Args:
        tokenizer_path: Path handed to ``LaughTokenizer``.
        output_path: Forwarded to ``BinaryShardWriter`` as ``output_path``.
        shard_tokens: Forwarded to ``BinaryShardWriter`` as ``shard_tokens``
            (presumably the token budget of the shard; confirm against the
            writer's implementation).
        sources: List of ``{"name": ..., "weight": ...}`` dicts forwarded to
            ``DomainSampler``. ``None`` selects the built-in default mix.
    """
    if sources is None:
        # NOTE(review): the default weights sum to 0.6, not 1.0. Confirm that
        # DomainSampler normalizes weights, or that further sources were
        # meant to be listed here.
        sources = [
            {"name": "HuggingFaceFW/fineweb-edu", "weight": 0.4},
            {"name": "bigcode/starcoderdata", "weight": 0.2},
        ]

    tokenizer = LaughTokenizer(tokenizer_path)
    sampler = DomainSampler(sources=sources)
    writer = BinaryShardWriter(
        tokenizer=tokenizer,
        output_path=output_path,
        shard_tokens=shard_tokens,
    )
    # The writer is given an explicit iterator, exactly as the original did.
    writer.build_shard(iter(sampler))
# Run the shard build only when this file is executed as a script, so that
# importing the module does not trigger it.
if __name__ == "__main__":
    main()