"""Split a text file into numbered shard files at blank-line boundaries.

Usage:
    python SCRIPT FILENAME NUM_LINE OUTPUT_DIR

FILENAME is read in full, surrounding whitespace is stripped, and the text is
split into lines.  A shard is closed at a blank line once at least NUM_LINE
lines precede that blank line inside the current shard; the blank line is kept
as the last line of the shard.  Whatever remains after the last cut becomes
the final shard.  Shards are written to OUTPUT_DIR as 000000.txt, 000001.txt,
and so on, using the platform default text encoding.
"""
import os
import sys


def split_into_shards(lines, min_lines):
    """Group *lines* into shards that end on blank lines.

    Args:
        lines: Sequence of lines without trailing newline characters.
        min_lines: Minimum number of lines that must precede a blank line in
            the current shard before the shard is cut at that blank line.

    Returns:
        A list of shards, each a list of lines.  Every shard except possibly
        the last ends with the blank line that closed it.
    """
    shards = []
    start = 0
    for index, line in enumerate(lines):
        if line == '' and index - start >= min_lines:
            # Keep the separating blank line at the end of the shard.
            shards.append(lines[start:index + 1])
            start = index + 1
    # Flush the tail.  Comparing against len(lines) (not len(lines) - 1)
    # keeps a tail that consists of a single line instead of dropping it.
    if start < len(lines):
        shards.append(lines[start:])
    return shards


def write_shards(shards, output_dir):
    """Write each shard to ``{output_dir}/{index:06}.txt``.

    Lines are joined with newlines and a single trailing newline is added.
    The output directory is created if it does not exist yet.
    """
    os.makedirs(output_dir, exist_ok=True)
    for index, shard in enumerate(shards):
        with open(f'{output_dir}/{index:06}.txt', 'w') as out:
            out.write('\n'.join(shard) + '\n')


def main(argv=None):
    """Run the splitter.

    Args:
        argv: Argument vector laid out like ``sys.argv`` (program name first,
            then FILENAME, NUM_LINE, OUTPUT_DIR).  Defaults to ``sys.argv``.
            Extra trailing arguments are ignored.

    Raises:
        SystemExit: If fewer than three arguments are supplied.
        ValueError: If NUM_LINE is not an integer.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 4:
        sys.exit(f'usage: {argv[0] if argv else "split"} '
                 'FILENAME NUM_LINE OUTPUT_DIR')

    filename = argv[1]
    min_lines = int(argv[2])  # parse once instead of on every line
    output_dir = argv[3]

    with open(filename) as src:
        text = src.read().strip()
    # An empty (or whitespace-only) file has no lines and yields no shards.
    lines = text.split('\n') if text else []

    write_shards(split_into_shards(lines, min_lines), output_dir)


if __name__ == '__main__':
    main()