Spaces:
Runtime error
Runtime error
from collections import defaultdict | |
import os | |
from tqdm import tqdm | |
import argparse | |
import json | |
import pickle | |
def parse_args(argv=None):
    """Parse the command-line options of the term-statistics script.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``, as the original script did.

    Returns:
        The parsed namespace with the required ``input`` and ``output``
        string attributes.
    """
    parser = argparse.ArgumentParser(description='Extract term statistics on collection.')
    parser.add_argument('--input', metavar='input file', help='input collection',
                        type=str, required=True)
    parser.add_argument('--output', metavar='output file', help='output pickle',
                        type=str, required=True)
    return parser.parse_args(argv)


def tally_lines(lines, counts):
    """Add the token counts of an iterable of JSON lines to ``counts``.

    Each non-blank line is parsed as a JSON object whose ``'contents'``
    string is split on single spaces; every resulting piece is counted.

    Args:
        lines: Iterable of strings, one JSON document per line.
        counts: Mapping from token to count, updated in place. It must
            supply a default of 0 for unseen tokens (e.g.
            ``defaultdict(int)``).

    Returns:
        The number of tokens added by this call.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a JSON object has no ``'contents'`` key.
    """
    seen = 0
    for line in lines:
        # Blank lines (e.g. a trailing newline at end of file) are not JSON
        # documents; skip them instead of letting json.loads raise.
        if not line.strip():
            continue
        # split(' ') is kept deliberately: unlike split(), it yields empty
        # strings for consecutive spaces and for empty contents, and those
        # empty tokens are part of the statistics this script has always
        # produced.
        tokens = json.loads(line)['contents'].split(' ')
        for token in tokens:
            counts[token] += 1
        seen += len(tokens)
    return seen


def collect_term_stats(input_dir):
    """Count token occurrences over every regular file directly in ``input_dir``.

    Args:
        input_dir: Directory whose files each hold one JSON object per line
            with a ``'contents'`` string field.

    Returns:
        A ``defaultdict(int)`` mapping each token to its number of
        occurrences, plus the key ``'TOTAL'`` holding the overall token
        count.
    """
    counts = defaultdict(int)
    total = 0
    # Sorted so the insertion order of the resulting dict (and therefore the
    # pickle) does not depend on the file system's listing order.
    for file_name in sorted(os.listdir(input_dir)):
        file_path = os.path.join(input_dir, file_name)
        # Sub-directories cannot be opened as text files; skip them.
        if not os.path.isfile(file_path):
            continue
        # Explicit UTF-8 so decoding does not vary with the platform locale.
        with open(file_path, encoding='utf-8') as fin:
            total += tally_lines(tqdm(fin), counts)
    # NOTE: this overwrites the count of a literal token 'TOTAL' if the
    # collection contains one. The key is kept as-is because consumers of
    # the pickle may read it.
    counts['TOTAL'] = total
    return counts


def main(argv=None):
    """Run the script: parse options, gather statistics, pickle the result."""
    args = parse_args(argv)
    print(args)
    stats = collect_term_stats(args.input)
    with open(args.output, 'wb') as handle:
        pickle.dump(stats, handle, protocol=pickle.HIGHEST_PROTOCOL)


if __name__ == '__main__':
    main()