|
|
|
|
|
|
|
|
|
|
|
""" |
|
Split a large file into shards while respecting document boundaries. Documents |
|
should be separated by a single empty line. |
|
""" |
|
|
|
import argparse |
|
import contextlib |
|
|
|
|
|
def main():
    """Split a text file into shard files, assigning whole documents round-robin.

    Command-line arguments:
        input: path of the UTF-8 text file to split. A line that is empty
            (or whitespace-only) ends the current document.
        --num-shards: number of shard files to write; must be greater than 1.

    Document ``k`` (0-based, counting only non-empty documents) is written to
    ``<input>.shard<k % num_shards>``. Within a shard, consecutive documents
    are separated by a single empty line.

    Exits with status 2 (via ``parser.error``) when ``--num-shards`` is
    missing or not greater than 1.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("input")
    parser.add_argument("--num-shards", type=int)
    args = parser.parse_args()

    # Validate explicitly instead of with ``assert``: assertions are removed
    # when Python runs with -O, which would let invalid values through.
    if args.num_shards is None or args.num_shards <= 1:
        parser.error("--num-shards must be an integer greater than 1")

    with contextlib.ExitStack() as stack:
        # Open the input first so that a missing input file fails before any
        # shard file is created.
        h = stack.enter_context(open(args.input, "r", encoding="utf-8"))
        outputs = [
            stack.enter_context(
                open(args.input + ".shard" + str(i), "w", encoding="utf-8")
            )
            for i in range(args.num_shards)
        ]

        doc = []
        num_docs = 0

        def output_doc():
            """Write the buffered document to its shard and reset the buffer."""
            nonlocal num_docs
            # Nothing buffered (consecutive or trailing blank lines): skip, so
            # no stray separator is written and the round-robin is not skewed.
            if not doc:
                return
            out = outputs[num_docs % args.num_shards]
            # With round-robin assignment, a shard has already received a
            # document exactly when a full round has been completed.
            if num_docs >= args.num_shards:
                out.write("\n")
            out.writelines(doc)
            doc.clear()
            num_docs += 1

        for line in h:
            if line.strip() == "":
                output_doc()
            else:
                doc.append(line)
        # Flush the final document when the file does not end in a blank line.
        output_doc()
|
|
|
|
|
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|