nikhil_staging / src /signals /text_statistics.py
nsthorat's picture
Push
e4f9cbe
raw
history blame
No virus
2.06 kB
"""Compute text statistics for a document."""
from typing import Iterable, Optional, cast
import spacy
import textacy
from spacy import Language
from spacy.tokens import Doc
from textacy import text_stats
from typing_extensions import override
from ..schema import Field, Item, RichData, field
from ..utils import chunks
from .signal import TextSignal
# Name of the spaCy model package that `TextStatisticsSignal.setup` checks for, downloads when it
# is not installed, and loads.
SPACY_LANG_MODEL = 'en_core_web_sm'
# Number of documents passed to `chunks` per batch in `TextStatisticsSignal.compute`.
SPACY_BATCH_SIZE = 128
# Keys of the item emitted for each document; they are also the field names declared in
# `TextStatisticsSignal.fields`.
NUM_CHARS = 'num_characters'
READABILITY = 'readability'
TYPE_TOKEN_RATIO = 'type_token_ratio'
class TextStatisticsSignal(TextSignal):
    """Compute per-document text statistics.

    For every input document the signal emits a character count, an automated readability
    index and a type-token ratio, all computed with `textacy.text_stats` on a spaCy document.
    """
    name = 'text_statistics'
    display_name = 'Text Statistics'

    # spaCy pipeline used to build documents; remains None until `setup` has run.
    _lang: Optional[Language] = None

    @override
    def fields(self) -> Field:
        """Return the schema of the item produced for each document."""
        return field(fields={
            NUM_CHARS: 'int32',
            READABILITY: 'float32',
            TYPE_TOKEN_RATIO: 'float32',
        })

    @override
    def setup(self) -> None:
        """Load the spaCy language model, downloading it first when it is not installed."""
        if not spacy.util.is_package(SPACY_LANG_MODEL):
            spacy.cli.download(SPACY_LANG_MODEL)
        # Every component listed below is switched off at load time.
        # NOTE(review): presumably the statistics only need tokenization -- confirm before
        # re-enabling any of them.
        self._lang = spacy.load(
            SPACY_LANG_MODEL,
            disable=[
                'parser', 'tagger', 'ner', 'lemmatizer', 'textcat', 'custom', 'tok2vec',
                'attribute_ruler'
            ])

    @override
    def compute(self, data: Iterable[RichData]) -> Iterable[Optional[Item]]:
        """Yield one statistics item per input document, in input order.

        Args:
          data: The documents to analyze. Falsy entries (e.g. None) are treated as empty text.

        Yields:
          A dict keyed by NUM_CHARS, READABILITY and TYPE_TOKEN_RATIO, or None when the
          document has no tokens or the ratio-based statistics cannot be computed for it.

        Raises:
          RuntimeError: If `setup` has not been called, so no language model is loaded.
        """
        if self._lang is None:
            raise RuntimeError('TextStatisticsSignal.setup() must be called before compute().')

        for batch in chunks(data, SPACY_BATCH_SIZE):
            # Replace None with empty strings to avoid spacy errors.
            texts = [text or '' for text in batch]
            # See https://textacy.readthedocs.io/en/latest/api_reference/text_stats.html for a
            # list of available statistics.
            corpus = textacy.Corpus(lang=self._lang, data=texts)
            for doc in cast(Iterable[Doc], corpus):
                if len(doc) == 0:
                    yield None
                    continue

                try:
                    readability = text_stats.readability.automated_readability_index(doc)
                    ttr = text_stats.diversity.ttr(doc)
                except ZeroDivisionError:
                    # A document can contain tokens but no words (e.g. only whitespace or
                    # punctuation); the ratio-based statistics may then divide by zero.
                    # Treat it like an empty document instead of aborting the whole stream.
                    yield None
                    continue

                yield {
                    NUM_CHARS: text_stats.basics.n_chars(doc),
                    READABILITY: readability,
                    TYPE_TOKEN_RATIO: ttr,
                }