"""Download the sharded ``mc4-es-sampled`` training files and merge them.

Every gzip-compressed JSON-lines shard for the chosen configuration is
fetched from the Hugging Face Hub and its records are appended to a single
local file named ``mc4-es-train-50M-{config}.jsonl``.

Usage: run the script with an optional configuration name as the first
command-line argument (defaults to ``stepwise``).
"""
import gzip
import io
import json
import sys

import requests
from tqdm import tqdm

_DATA_URL_TRAIN = "https://huggingface.co/datasets/bertin-project/mc4-es-sampled/resolve/main/mc4-es-train-50M-{config}-shard-{index:04d}-of-{n_shards:04d}.json.gz"


def _shard_urls(config, n_shards):
    """Return the URL of every shard of *config*; shard indices are 1-based."""
    return [
        _DATA_URL_TRAIN.format(config=config, index=index, n_shards=n_shards)
        for index in range(1, n_shards + 1)
    ]


def main(config="stepwise", n_shards=1024, timeout=60):
    """Download all shards of *config* and concatenate them into one JSONL file.

    Args:
        config: Configuration name substituted into the shard URL and into
            the output file name.
        n_shards: Total number of shards to fetch (shards ``1..n_shards``).
        timeout: Seconds passed to ``requests`` as the per-request timeout,
            so that a stalled connection cannot hang the run forever.

    Raises:
        requests.HTTPError: If a shard request returns an error status.
        requests.RequestException: On connection failures or timeouts.
        json.JSONDecodeError: If a non-blank line of a shard is not valid JSON.
    """
    output_path = f"mc4-es-train-50M-{config}.jsonl"
    with open(output_path, "w", encoding="utf-8") as f:
        # A session reuses the underlying connection across the many requests.
        with requests.Session() as session:
            for data_url in tqdm(_shard_urls(config, n_shards)):
                response = session.get(data_url, timeout=timeout)
                # Fail loudly on 4xx/5xx instead of handing an HTML error
                # page to gzip, which would raise a misleading BadGzipFile.
                response.raise_for_status()
                bio = io.BytesIO(response.content)
                with gzip.open(bio, "rt", encoding="utf8") as g:
                    for line in g:
                        line = line.strip()
                        if not line:
                            # Tolerate blank lines rather than crash in json.loads.
                            continue
                        # Round-trip through json to validate each record and
                        # emit it in a normalized one-object-per-line form.
                        f.write(json.dumps(json.loads(line)) + "\n")


if __name__ == "__main__":
    # Slice instead of indexing so that a missing CLI argument falls back to
    # main()'s default config rather than raising IndexError.
    main(*sys.argv[1:2])