File size: 1,616 Bytes
26fd00c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Split a large file into shards while respecting document boundaries. Documents
should be separated by a single empty line.
"""

import argparse
import contextlib


def main(argv=None):
    """Split the input file into shards without breaking documents apart.

    A document is a run of consecutive non-blank lines; blank
    (whitespace-only) lines act as document separators. Documents are dealt
    round-robin: the k-th non-empty document is written to shard
    ``k % num_shards``, stored at ``<input>.shard<i>``. Inside each shard,
    documents are separated by exactly one empty line.

    Runs of several blank lines, and blank lines at the very start or end of
    the input, do not produce empty documents, so they neither add stray
    blank lines to a shard nor disturb the round-robin assignment.

    Args:
        argv: Optional list of command-line arguments. When ``None``,
            argparse reads them from ``sys.argv[1:]``.

    Raises:
        SystemExit: Raised by argparse (exit status 2) when ``--num-shards``
            is missing or is not greater than 1.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("input")
    parser.add_argument("--num-shards", type=int)
    args = parser.parse_args(argv)

    # Explicit check instead of `assert`: asserts vanish under `python -O`.
    if args.num_shards is None or args.num_shards <= 1:
        parser.error("--num-shards is required and must be greater than 1")

    with contextlib.ExitStack() as stack:
        # Open the input first so a missing input creates no shard files.
        source = stack.enter_context(open(args.input, "r", encoding="utf-8"))
        outputs = [
            stack.enter_context(
                open(args.input + ".shard" + str(i), "w", encoding="utf-8")
            )
            for i in range(args.num_shards)
        ]

        doc = []  # lines of the document currently being accumulated
        shard_started = [False] * args.num_shards  # has shard i any doc yet?
        num_docs = 0  # number of non-empty documents written so far

        def flush_doc():
            """Write the pending document to its shard and reset the buffer."""
            nonlocal num_docs
            if not doc:
                # Nothing accumulated (repeated / leading / trailing blank
                # line): skip so no empty document is emitted.
                return
            shard = num_docs % args.num_shards
            out = outputs[shard]
            if shard_started[shard]:
                # Separate from the previous document in this shard.
                out.write("\n")
            shard_started[shard] = True
            out.writelines(doc)
            doc.clear()
            num_docs += 1

        for line in source:
            if line.strip():
                doc.append(line)
            else:  # blank line marks the end of the current document
                flush_doc()
        # The last document may not be followed by a blank line.
        flush_doc()


# Run the sharding command-line tool only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()