Spaces:
Runtime error
Runtime error
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging | |
import re | |
from enum import Enum | |
from ..multithreading import Counters | |
from ..pyclass import autoclass, cast, JPaths | |
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Java classes from Anserini, loaded by their fully qualified names.
# JSourceDocument is what SourceDocument checks incoming documents against.
JFileSegment = autoclass('io.anserini.collection.FileSegment')
JSourceDocument = autoclass('io.anserini.collection.SourceDocument')
class JCollections(Enum):
    """
    Enumeration of the Anserini collection classes known to this module.

    Each member is named after a class in the ``io.anserini.collection``
    Java package and its value is whatever ``autoclass`` returns for that
    fully qualified class name. ``Collection._get_collection`` looks a member
    up by name and calls its value with the collection path.
    """

    AclAnthology = autoclass('io.anserini.collection.AclAnthology')
    CarCollection = autoclass('io.anserini.collection.CarCollection')
    Cord19AbstractCollection = autoclass('io.anserini.collection.Cord19AbstractCollection')
    ClueWeb09Collection = autoclass('io.anserini.collection.ClueWeb09Collection')
    ClueWeb12Collection = autoclass('io.anserini.collection.ClueWeb12Collection')
    HtmlCollection = autoclass('io.anserini.collection.HtmlCollection')
    JsonCollection = autoclass('io.anserini.collection.JsonCollection')
    NewYorkTimesCollection = autoclass('io.anserini.collection.NewYorkTimesCollection')
    TrecCollection = autoclass('io.anserini.collection.TrecCollection')
    TrecwebCollection = autoclass('io.anserini.collection.TrecwebCollection')
    TweetCollection = autoclass('io.anserini.collection.TweetCollection')
    WashingtonPostCollection = autoclass('io.anserini.collection.WashingtonPostCollection')
    WikipediaCollection = autoclass('io.anserini.collection.WikipediaCollection')
class Collection:
    """
    Iterable wrapper class for Anserini's DocumentCollection.

    Iterating over a ``Collection`` yields one ``FileSegment`` wrapper for
    every segment produced by the underlying Java iterator. The ``counters``
    attribute is shared with those segments, which record errors and skipped
    documents on it.

    Parameters
    ----------
    collection_class : str
        Name of collection class to instantiate
    collection_path : str
        Path to directory containing collection

    Raises
    ------
    ValueError
        If ``collection_class`` is not a member name of ``JCollections`` or
        the Java collection could not be constructed. The triggering
        exception is attached as ``__cause__``.
    """

    def __init__(self, collection_class, collection_path):
        self.counters = Counters()
        self.collection_class = collection_class
        self.collection_path = JPaths.get(collection_path)
        self.object = self._get_collection()
        self.collection_iterator = self.object.iterator()

    def _get_collection(self):
        """Instantiate the Java collection named by ``self.collection_class``."""
        try:
            return JCollections[self.collection_class].value(self.collection_path)
        except Exception as err:
            # Catch ``Exception`` rather than everything, so that
            # KeyboardInterrupt / SystemExit are not turned into ValueError.
            # Chaining keeps the real cause (unknown name vs. Java failure)
            # visible in the traceback.
            raise ValueError(self.collection_class) from err

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next segment wrapped in a ``FileSegment``."""
        if not self.collection_iterator.hasNext():
            raise StopIteration
        fs = self.collection_iterator.next()
        return FileSegment(self, fs, fs.getSegmentPath())
class FileSegment:
    """
    Iterable wrapper class for Anserini's FileSegment.

    Iterating yields one ``SourceDocument`` per document of the segment.
    When the segment is exhausted, errors and skipped documents are logged
    and recorded on the parent collection's counters, and the underlying
    Java segment is closed. This finalization happens only once, even if
    ``__next__`` is called again afterwards.

    Parameters
    ----------
    collection : Collection
        Parent collection of the file segment
    segment : JFileSegment
        FileSegment object to create wrapper from
    segment_path : str
        Path to file backing the file segment
    """

    def __init__(self, collection, segment, segment_path):
        self.collection = collection
        try:
            # Prefer the collection-specific nested ``$Segment`` type.
            self.object = cast(collection.object.getClass().getName() +
                               '$Segment', segment)
        except Exception:
            # Fall back to the generic FileSegment type. ``Exception`` (not a
            # bare ``except``) so KeyboardInterrupt / SystemExit propagate.
            logger.exception('Exception from casting FileSegment type...')
            self.object = cast('io.anserini.collection.FileSegment', segment)
        self.segment_iterator = self.object.iterator()
        self.segment_path = segment_path
        # Flatten the relative path into a single name by replacing
        # both kinds of path separator with '-'.
        self.segment_name = re.sub(r'\\|\/', '-', collection.collection_path.relativize(segment_path).toString())
        # Set once the exhausted segment has been reported and closed.
        self._finalized = False

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next document wrapped in a ``SourceDocument``."""
        if self._finalized:
            raise StopIteration
        # Use the iterator obtained in __init__ instead of requesting a new
        # one from the Java object for every hasNext()/next() call.
        if self.segment_iterator.hasNext():
            return SourceDocument(self, self.segment_iterator.next())
        self._finalize()
        raise StopIteration

    def _finalize(self):
        """Report errors and skips for the exhausted segment, then close it."""
        # Mark first so counters are never incremented twice for one segment.
        self._finalized = True
        # log if iteration stopped by error
        if self.object.getErrorStatus():
            logger.error(self.segment_name + ': Error from segment iteration, stopping...')
            self.collection.counters.errors.increment()
        # log skipped documents
        skipped = self.object.getSkippedCount()
        if skipped > 0:
            self.collection.counters.skips.increment(skipped)
            logger.warning(self.segment_name + ': ' + str(skipped) + ' documents skipped')
        self.object.close()
class SourceDocument:
    """
    Python-side view of a single Anserini SourceDocument.

    The Java object's ``id()``, ``indexable()``, ``contents()`` and ``raw()``
    are each called once at construction time and the results are stored as
    the attributes ``id``, ``indexable``, ``contents`` and ``raw``.

    Parameters
    ----------
    segment : FileSegment
        Segment the document was read from
    document : io.anserini.collection.SourceDocument
        Java document object to wrap

    Raises
    ------
    TypeError
        If ``document`` is not an instance of ``JSourceDocument``.
    """

    def __init__(self, segment, document):
        # Reject anything that is not the expected Java type up front.
        is_java_document = isinstance(document, JSourceDocument)
        if not is_java_document:
            raise TypeError('Invalid JSourceDocument!')

        java_doc = document
        self.segment, self.object = segment, java_doc

        # Snapshot the Java accessors, in the same order as they are declared.
        self.id = java_doc.id()
        self.indexable = java_doc.indexable()
        self.contents = java_doc.contents()
        self.raw = java_doc.raw()