#!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. """ Count the number of documents and average number of lines and tokens per document in a large file. Documents should be separated by a single empty line. """ import argparse import gzip import sys import numpy as np def main(): parser = argparse.ArgumentParser() parser.add_argument("input") parser.add_argument("--gzip", action="store_true") args = parser.parse_args() def gopen(): if args.gzip: return gzip.open(args.input, "r") else: return open(args.input, "r", encoding="utf-8") num_lines = [] num_toks = [] with gopen() as h: num_docs = 1 num_lines_in_doc = 0 num_toks_in_doc = 0 for i, line in enumerate(h): if len(line.strip()) == 0: # empty line indicates new document num_docs += 1 num_lines.append(num_lines_in_doc) num_toks.append(num_toks_in_doc) num_lines_in_doc = 0 num_toks_in_doc = 0 else: num_lines_in_doc += 1 num_toks_in_doc += len(line.rstrip().split()) if i % 1000000 == 0: print(i, file=sys.stderr, end="", flush=True) elif i % 100000 == 0: print(".", file=sys.stderr, end="", flush=True) print(file=sys.stderr, flush=True) print("found {} docs".format(num_docs)) print("average num lines per doc: {}".format(np.mean(num_lines))) print("average num toks per doc: {}".format(np.mean(num_toks))) if __name__ == "__main__": main()