Spaces:

geonmin-kim
/

NetsPresso_QA

Runtime error

File size: 5,500 Bytes

d6585f5

#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import logging
import re
from enum import Enum

from ..multithreading import Counters
from ..pyclass import autoclass, cast, JPaths

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


# Handles to Anserini's Java classes, resolved by fully-qualified name through
# ``autoclass``. JSourceDocument is the type SourceDocument.__init__ checks its
# ``document`` argument against; JFileSegment is not referenced elsewhere in
# the visible part of this module.
JFileSegment = autoclass('io.anserini.collection.FileSegment')
JSourceDocument = autoclass('io.anserini.collection.SourceDocument')


class JCollections(Enum):
    """
    Lookup table from collection name to Anserini collection class.

    Each member's name is the string accepted as ``collection_class`` by
    ``Collection``; each member's value is the handle returned by
    ``autoclass`` for the corresponding ``io.anserini.collection`` class.
    ``Collection._get_collection`` looks a member up by name and calls its
    value with the collection path to build the underlying collection object.
    """

    AclAnthology = autoclass('io.anserini.collection.AclAnthology')
    CarCollection = autoclass('io.anserini.collection.CarCollection')
    Cord19AbstractCollection = autoclass('io.anserini.collection.Cord19AbstractCollection')
    ClueWeb09Collection = autoclass('io.anserini.collection.ClueWeb09Collection')
    ClueWeb12Collection = autoclass('io.anserini.collection.ClueWeb12Collection')
    HtmlCollection = autoclass('io.anserini.collection.HtmlCollection')
    JsonCollection = autoclass('io.anserini.collection.JsonCollection')
    NewYorkTimesCollection = autoclass('io.anserini.collection.NewYorkTimesCollection')
    TrecCollection = autoclass('io.anserini.collection.TrecCollection')
    TrecwebCollection = autoclass('io.anserini.collection.TrecwebCollection')
    TweetCollection = autoclass('io.anserini.collection.TweetCollection')
    WashingtonPostCollection = autoclass('io.anserini.collection.WashingtonPostCollection')
    WikipediaCollection = autoclass('io.anserini.collection.WikipediaCollection')


class Collection:
    """
    Iterable wrapper class for Anserini's DocumentCollection.

    The instance is its own iterator: each ``next()`` call pulls the next
    segment from the underlying collection's iterator and wraps it in a
    ``FileSegment``. Because the iterator is created once in ``__init__``,
    a ``Collection`` can only be iterated a single time.

    Parameters
    ----------
    collection_class : str
        Name of collection class to instantiate; must be the name of a
        ``JCollections`` member
    collection_path : str
        Path to directory containing collection

    Attributes
    ----------
    counters : Counters
        Shared counters; ``FileSegment`` increments ``errors`` and ``skips``
        on it when a segment is exhausted
    object
        The instantiated underlying collection object

    Raises
    ------
    ValueError
        If ``collection_class`` is not a known collection name or the
        underlying collection cannot be constructed for ``collection_path``.
        The original failure is attached as ``__cause__``.
    """

    def __init__(self, collection_class, collection_path):
        self.counters = Counters()
        self.collection_class = collection_class
        self.collection_path = JPaths.get(collection_path)
        self.object = self._get_collection()
        self.collection_iterator = self.object.iterator()

    def _get_collection(self):
        """Instantiate the collection class named by ``self.collection_class``.

        Raises
        ------
        ValueError
            Carrying the requested class name, chained to the underlying error.
        """
        try:
            return JCollections[self.collection_class].value(self.collection_path)
        except Exception as err:
            # ``except Exception`` (rather than a bare ``except``) lets
            # KeyboardInterrupt/SystemExit propagate; chaining keeps the real
            # reason (unknown name vs. constructor failure) visible to callers.
            raise ValueError(self.collection_class) from err

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next segment wrapped as a ``FileSegment``."""
        if not self.collection_iterator.hasNext():
            raise StopIteration
        fs = self.collection_iterator.next()
        return FileSegment(self, fs, fs.getSegmentPath())


class FileSegment:
    """
    Iterable wrapper class for Anserini's FileSegment.

    The instance is its own iterator: each ``next()`` call returns the next
    document of the segment wrapped in a ``SourceDocument``. When the segment
    is exhausted, error and skip statistics are logged and added to the parent
    collection's counters, and the underlying segment is closed.

    Parameters
    ----------
    collection : Collection
        Parent collection of the file segment
    segment : JFileSegment
        FileSegment object to create wrapper from
    segment_path : str
        Path to file backing the file segment

    Attributes
    ----------
    object
        The segment cast to ``<collection class name>$Segment``, or to the
        generic ``io.anserini.collection.FileSegment`` if that cast fails
    segment_iterator
        Iterator obtained once from ``object`` and used for all iteration
    segment_name : str
        ``segment_path`` relative to the collection path, with ``\\`` and
        ``/`` replaced by ``-``; used as a prefix in log messages
    """

    def __init__(self, collection, segment, segment_path):
        self.collection = collection
        try:
            self.object = cast(collection.object.getClass().getName() +
                               '$Segment', segment)
        except Exception:
            # Best-effort fallback to the generic segment type; the failure is
            # logged rather than raised. Only Exception is caught, so
            # KeyboardInterrupt/SystemExit are not swallowed here.
            logger.exception('Exception from casting FileSegment type...')
            self.object = cast('io.anserini.collection.FileSegment', segment)

        self.segment_iterator = self.object.iterator()
        self.segment_path = segment_path
        self.segment_name = re.sub(r'\\|\/', '-', collection.collection_path.relativize(segment_path).toString())

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next document wrapped as a ``SourceDocument``.

        Raises
        ------
        StopIteration
            When the segment has no more documents. Before raising, logs and
            counts an iteration error and/or skipped documents as reported by
            the segment, then closes the segment.
        """
        # Reuse the iterator created in __init__ instead of requesting a fresh
        # one from the segment for every hasNext()/next() call.
        if self.segment_iterator.hasNext():
            return SourceDocument(self, self.segment_iterator.next())

        # log if iteration stopped by error
        if self.object.getErrorStatus():
            logger.error('%s: Error from segment iteration, stopping...', self.segment_name)
            self.collection.counters.errors.increment()

        # stop iteration and log skipped documents
        skipped = self.object.getSkippedCount()
        if skipped > 0:
            self.collection.counters.skips.increment(skipped)
            logger.warning('%s: %s documents skipped', self.segment_name, skipped)
        self.object.close()
        raise StopIteration


class SourceDocument:
    """
    Wrapper class for Anserini's SourceDocument.

    On construction the wrapped document's ``id()``, ``indexable()``,
    ``contents()`` and ``raw()`` accessors are each called once, in that
    order, and the results are stored as same-named attributes.

    Parameters
    ----------

    segment : FileSegment
        Parent segment of the source document
    document : io.anserini.collection.SourceDocument
        SourceDocument object to create wrapper from

    Raises
    ------
    TypeError
        If ``document`` is not an instance of ``JSourceDocument``
    """

    def __init__(self, segment, document):
        is_source_document = isinstance(document, JSourceDocument)
        if not is_source_document:
            raise TypeError('Invalid JSourceDocument!')

        self.segment, self.object = segment, document

        # Snapshot each accessor's result onto the wrapper under the same name.
        for accessor in ('id', 'indexable', 'contents', 'raw'):
            setattr(self, accessor, getattr(self.object, accessor)())