Spaces:
Runtime error
Runtime error
# | |
# Pyserini: Reproducible IR research with sparse and dense representations | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
# | |
import sys | |
import argparse | |
import pandas as pd | |
sys.path.insert(0, './') | |
from pyserini.collection import Collection, Cord19Article | |
def main(path, coef, top): | |
documents_with_text = dict() | |
documents_without_text = dict() | |
documents_zero_abs = dict() | |
documents_empty_body = dict() | |
cnt = 0 | |
collection = Collection('Cord19AbstractCollection', path) | |
articles = collection.__next__() | |
# iterate through raw collection | |
for (i, d) in enumerate(articles): | |
article = Cord19Article(d.raw) | |
# documents with empty abstract | |
if len(article.abstract()) == 0: | |
documents_zero_abs.setdefault(article.cord_uid(), []) | |
documents_zero_abs[article.cord_uid()].append(article.metadata()["doi"]) | |
else: | |
# document does not have text | |
if not article.is_full_text(): | |
documents_without_text.setdefault(article.cord_uid(), []) | |
documents_without_text[article.cord_uid()].append(article.metadata()["doi"]) | |
documents_without_text[article.cord_uid()].append(len(article.title())) | |
documents_without_text[article.cord_uid()].append(len(article.abstract())) | |
# document whose text body is empty | |
elif len(article.body()) == 0: | |
documents_empty_body.setdefault(article.cord_uid(), []) | |
documents_empty_body[article.cord_uid()].append(article.metadata()["doi"]) | |
# normal document and we save for analysis later | |
else: | |
num_paragraph = len(article.body()) | |
title_len = len(article.title()) * num_paragraph | |
abstract_len = len(article.abstract()) * num_paragraph | |
documents_with_text.setdefault(article.cord_uid(), []) | |
documents_with_text[article.cord_uid()].append(article.metadata()["doi"]) | |
documents_with_text[article.cord_uid()].append(title_len) | |
documents_with_text[article.cord_uid()].append(abstract_len) | |
documents_with_text[article.cord_uid()].append(num_paragraph) | |
cnt = cnt + 1 | |
if cnt % 1000 == 0: | |
print(f'{cnt} articles read...') | |
documents_with_text_df = pd.DataFrame([([k] + v) for k, v in documents_with_text.items()]) | |
documents_with_text_df = documents_with_text_df.loc[:, [0, 1, 2, 3, 4]] | |
documents_with_text_df.columns = ['docid', 'DOI', 'title', 'abstract', 'num_paragraph'] | |
# using quantile to find outliers return a df | |
q1 = documents_with_text_df['abstract'].quantile(0.25) | |
q3 = documents_with_text_df['abstract'].quantile(0.75) | |
iqr = q3 - q1 | |
# We only consider extreme big value, since small value will not cause file explosion; | |
# Could choose other coefficient to find more "extreme" outliers. | |
filter = documents_with_text_df['abstract'] >= q3 + coef * iqr | |
# abstract outlier | |
outlier = documents_with_text_df.loc[filter] | |
sorted = outlier.sort_values(by=['abstract'], ascending=False) | |
tops = sorted.head(top) | |
tops.to_csv('length_outlier.csv') | |
print(f'found {len(documents_with_text_df.index)} outliers and saved top {top} into file length_outlier.csv') | |
if __name__ == '__main__': | |
parser = argparse.ArgumentParser(description='Identifies outliers in CORD-19') | |
parser.add_argument('--path', type=str, required=True, help='the path to collection') | |
parser.add_argument('--coefficient', type=int, default=1.5, help='outlier coefficient') | |
parser.add_argument('--top', type=int, default=10, help='number of top outliers') | |
args = parser.parse_args() | |
main(args.path, args.coefficient, args.top) | |