Spaces:
Runtime error
Runtime error
File size: 1,562 Bytes
bfc0ec6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
"""Text splitters using spaCy."""
from typing import TYPE_CHECKING, Any, Iterable, Optional
from typing_extensions import override
from ..schema import Item, RichData, lilac_span
from ..signal import TextSplitterSignal
if TYPE_CHECKING:
from spacy.language import Language
class SentenceSplitterSpacy(TextSplitterSignal):
"""Splits documents into sentences using the SpaCy sentence tokenizer."""
name = 'sentences'
display_name = 'Sentence Splitter'
language: str = 'en'
_tokenizer: 'Language'
def __init__(self, **kwargs: Any):
super().__init__(**kwargs)
@override
def setup(self) -> None:
try:
import spacy
except ImportError:
raise ImportError('Could not import the "spacy" python package. '
'Please install it with `pip install spacy`.')
self._tokenizer = spacy.blank(self.language)
self._tokenizer.add_pipe('sentencizer')
# Increase the number of characters of the tokenizer as we're not using a parser or NER.
self._tokenizer.max_length = 10_000_000
@override
def compute(self, data: Iterable[RichData]) -> Iterable[Optional[Item]]:
text_data = (row if isinstance(row, str) else '' for row in data)
for doc in self._tokenizer.pipe(text_data):
sentences = doc.sents
result = [lilac_span(token.start_char, token.end_char) for token in sentences]
if result:
yield result
else:
yield None
class Config:
# Language is required even though it has a default value.
schema_extra = {'required': ['language']}
|