File size: 916 Bytes
a38611e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import io
import gzip
import json
import sys

import requests
from tqdm import tqdm

# URL template for one gzipped JSON-lines training shard. Placeholders:
#   {config}   - sampling configuration name (e.g. "stepwise", the default used by main)
#   {index}    - shard number, zero-padded to 4 digits
#   {n_shards} - total number of shards, zero-padded to 4 digits
_DATA_URL_TRAIN = "https://huggingface.co/datasets/bertin-project/mc4-es-sampled/resolve/main/mc4-es-train-50M-{config}-shard-{index:04d}-of-{n_shards:04d}.json.gz"


def main(config="stepwise"):
    """Download every training shard for *config* and merge them into one JSONL file.

    Shards are fetched sequentially (with a tqdm progress bar), decompressed
    in memory, and appended to ``mc4-es-train-50M-{config}.jsonl`` in the
    current working directory. Each input line is parsed and re-serialized
    with ``json.dumps``, so malformed lines fail loudly and the output uses
    ``json.dumps``'s default formatting (non-ASCII characters are escaped).

    Args:
        config: Sampling configuration name substituted into the shard URL
            and the output file name.

    Raises:
        requests.HTTPError: If a shard request returns an error status.
        requests.Timeout: If the server stops responding while a shard is
            being fetched.
        json.JSONDecodeError: If a line in a shard is not valid JSON.
    """
    n_shards = 1024
    # Shard indices in the URLs are 1-based: 0001 .. 1024.
    data_urls = [
        _DATA_URL_TRAIN.format(config=config, index=index, n_shards=n_shards)
        for index in range(1, n_shards + 1)
    ]
    with open(f"mc4-es-train-50M-{config}.jsonl", "w", encoding="utf-8") as f:
        for data_url in tqdm(data_urls):
            # The timeout bounds connect/read stalls, not the total download
            # time, so large shards are fine but a dead connection won't hang.
            response = requests.get(data_url, timeout=60)
            # Fail with a clear HTTP error instead of feeding an error page
            # to gzip and getting a confusing decompression failure.
            response.raise_for_status()
            with gzip.open(io.BytesIO(response.content), "rt", encoding="utf8") as g:
                for line in g:
                    # Round-trip through json to validate and normalize the line.
                    json_line = json.loads(line.strip())
                    f.write(json.dumps(json_line) + "\n")


if __name__ == "__main__":
    # Pass the first CLI argument as the config if one was given; otherwise
    # call main() with no arguments so its own default config applies
    # (indexing sys.argv[1] directly raised IndexError when it was omitted).
    main(*sys.argv[1:2])