Spaces:
Runtime error
Runtime error
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Starting point for writing this script
# https://stackoverflow.com/questions/13129618/histogram-values-of-a-pandas-series
import argparse | |
import matplotlib.pyplot as plt | |
import numpy as np | |
import sys | |
# Use Pyserini in this repo (as opposed to pip install) | |
sys.path.insert(0, './') | |
from pyserini.search.lucene import LuceneSearcher | |
from tqdm import tqdm | |
def save_histogram_data(doclengths, bins, prefix):
    """Write raw histogram counts and bin edges to two text files.

    Args:
        doclengths: Array-like of per-document lengths.
        bins: Bin edges handed to ``np.histogram``.
        prefix: Path prefix of the output files; counts are written to
            ``<prefix>-counts.txt`` and bin edges to ``<prefix>-bins.txt``.

    Returns:
        The ``(counts, edges)`` tuple produced by ``np.histogram``.
    """
    counts, edges = np.histogram(doclengths, bins=bins)
    np.savetxt(f'{prefix}-counts.txt', counts, fmt="%s")
    # The bins file holds the bin edges; writing the counts here as well
    # would leave the edges unrecorded and duplicate the counts file.
    np.savetxt(f'{prefix}-bins.txt', edges, fmt="%s")
    return counts, edges


def main():
    """Compute document lengths of an index; optionally dump and plot them.

    Lengths are whitespace-token counts of each document's raw contents.
    Depending on the command-line flags, the histogram data is written to
    text files (``--output``) and/or rendered as a PDF (``--plot``); summary
    statistics are always printed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--index", type=str, help='Index Location.', required=True)
    parser.add_argument("--name", type=str, help='Name of collection.')
    parser.add_argument("--max", type=int, help='Max number of documents to analyze.')
    parser.add_argument("--bin-min", type=int, help='Minimum bin.', default=0)
    parser.add_argument("--bin-max", type=int, help='Maximum bin.', default=2000)
    parser.add_argument("--bin-width", type=int, help='Width of each bin.', default=50)
    parser.add_argument("--plot", type=str, help='Output file of histogram PDF.')
    parser.add_argument("--output", type=str, help='Prefix of raw count and bin data file.')
    args = parser.parse_args()

    # A zero or negative width cannot produce increasing bin edges; fail
    # early with a usage error instead of deep inside NumPy.
    if args.bin_width <= 0:
        parser.error('--bin-width must be a positive integer.')

    # Select a non-interactive backend before any figure is created.
    plt.switch_backend('agg')

    searcher = LuceneSearcher(args.index)

    # Determine how many documents to iterate over; an unset (or zero)
    # --max means "all documents in the index".
    if args.max:
        num_docs = min(args.max, searcher.num_docs)
    else:
        num_docs = searcher.num_docs

    print(f'Computing lengths for {num_docs} from {args.index}')

    # NOTE(review): integer ids 0..num_docs-1 are presumably internal Lucene
    # docids, and raw() is presumably non-None only when the index stores raw
    # documents -- confirm against the index being analyzed.
    doclengths = np.asarray(
        [len(searcher.doc(i).raw().split()) for i in tqdm(range(num_docs))]
    )

    # Compute bins; the extra bin_width makes bin_max itself an edge.
    bins = np.arange(args.bin_min, args.bin_max + args.bin_width, args.bin_width)

    # If user wants raw output of counts:
    if args.output:
        save_histogram_data(doclengths, bins, args.output)

    # If user wants plot:
    if args.plot:
        plt.hist(doclengths, bins=bins)
        title = f'Document Lengths: {args.name}' if args.name else 'Document Lengths'
        plt.title(title)
        plt.xlabel('Length')
        plt.ylabel('Count')
        plt.savefig(args.plot, bbox_inches='tight', format='pdf')

    # Print summary statistics
    print('\n# Summary statistics')
    if doclengths.size == 0:
        # min/max of an empty array raise ValueError; report it instead.
        print('No documents were analyzed.')
        return
    print(f'min: {np.amin(doclengths)}')
    print(f'max: {np.amax(doclengths)}')
    print(f'median: {np.median(doclengths)}')
    print(f'mean: {np.mean(doclengths)}')


if __name__ == '__main__':
    main()