Spaces:
Runtime error
Runtime error
File size: 2,064 Bytes
e4f9cbe |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
"""Compute text statistics for a document."""
from typing import Iterable, Optional, cast
import spacy
import textacy
from spacy import Language
from spacy.tokens import Doc
from textacy import text_stats
from typing_extensions import override
from ..schema import Field, Item, RichData, field
from ..utils import chunks
from .signal import TextSignal
# Name of the spaCy model package that the signal checks for, downloads if
# missing, and loads in `TextStatisticsSignal.setup()`.
SPACY_LANG_MODEL = 'en_core_web_sm'
# Number of input items grouped into one chunk before being handed to textacy
# in `TextStatisticsSignal.compute()`.
SPACY_BATCH_SIZE = 128
# Keys of the per-document result item; they are used both in the schema
# returned by `fields()` and in the dicts yielded by `compute()`.
NUM_CHARS = 'num_characters'
READABILITY = 'readability'
TYPE_TOKEN_RATIO = 'type_token_ratio'
class TextStatisticsSignal(TextSignal):
    """Compute text statistics for a document.

    For each input document the signal produces three values: the character
    count reported by textacy's `n_chars`, the automated readability index,
    and the type-token ratio.
    """

    name = 'text_statistics'
    display_name = 'Text Statistics'

    # spaCy language pipeline; stays `None` until `setup()` has loaded it.
    _lang: Optional[Language] = None

    @override
    def fields(self) -> Field:
        """Return the schema of the item emitted for every document."""
        return field(fields={
            NUM_CHARS: 'int32',
            READABILITY: 'float32',
            TYPE_TOKEN_RATIO: 'float32',
        })

    @override
    def setup(self) -> None:
        """Load the spaCy model, downloading it first if it is not installed."""
        if not spacy.util.is_package(SPACY_LANG_MODEL):
            spacy.cli.download(SPACY_LANG_MODEL)
        # The listed pipeline components are excluded when loading the model.
        # NOTE(review): presumably because the statistics below only need
        # tokenization -- confirm before re-enabling any of them.
        self._lang = spacy.load(
            SPACY_LANG_MODEL,
            disable=[
                'parser',
                'tagger',
                'ner',
                'lemmatizer',
                'textcat',
                'custom',
                'tok2vec',
                'attribute_ruler',
            ])

    @override
    def compute(self, data: Iterable[RichData]) -> Iterable[Optional[Item]]:
        """Yield one statistics item (or `None`) per input document.

        Args:
          data: The documents to analyze. Falsy entries (e.g. `None`) are
            treated as empty strings.

        Yields:
          `None` for a document that has no tokens; otherwise a dict with the
          `NUM_CHARS`, `READABILITY` and `TYPE_TOKEN_RATIO` keys. A statistic
          that is undefined for a document (its computation divides by zero,
          e.g. when the document contains no words) is reported as `None`.

        Raises:
          RuntimeError: If `setup()` has not been called, so no language model
            is loaded. Because this method is a generator, the error surfaces
            when iteration starts.
        """
        if self._lang is None:
            raise RuntimeError(
                'Language model was not loaded. Call `setup()` before `compute()`.')

        for batch in chunks(data, SPACY_BATCH_SIZE):
            # Replace None with empty strings to avoid spacy errors.
            texts = [x or '' for x in batch]
            # See https://textacy.readthedocs.io/en/0.11.0/api_reference/text_stats.html for a
            # list of available statistics.
            corpus = textacy.Corpus(lang=self._lang, data=texts)
            for doc in cast(Iterable[Doc], corpus):
                if not len(doc):
                    yield None
                    continue

                # A document can have tokens but no words (for instance only
                # punctuation); the ratio-based statistics are then undefined.
                # Report them as missing instead of aborting the whole stream.
                try:
                    readability = text_stats.readability.automated_readability_index(doc)
                except ZeroDivisionError:
                    readability = None
                try:
                    ttr = text_stats.diversity.ttr(doc)
                except ZeroDivisionError:
                    ttr = None
                num_chars = text_stats.basics.n_chars(doc)

                yield {
                    NUM_CHARS: num_chars,
                    READABILITY: readability,
                    TYPE_TOKEN_RATIO: ttr,
                }
|