from statistics import mean

import datasets
from nltk import word_tokenize
from packaging import version

import evaluate


if evaluate.config.PY_VERSION < version.parse("3.8"):
    import importlib_metadata
else:
    import importlib.metadata as importlib_metadata


NLTK_VERSION = version.parse(importlib_metadata.version("nltk"))

_DESCRIPTION = """ |
|
Returns the average length (in terms of the number of words) of the input data. |
|
""" |
|
|
|
_KWARGS_DESCRIPTION = """ |
|
Args: |
|
`data`: a list of `str` for which the word length is calculated. |
|
`tokenizer` (`Callable`) : the approach used for tokenizing `data` (optional). |
|
The default tokenizer is `word_tokenize` from NLTK: https://www.nltk.org/api/nltk.tokenize.html |
|
This can be replaced by any function that takes a string as input and returns a list of tokens as output. |
|
|
|
Returns: |
|
`average_word_length` (`float`) : the average number of words in the input list of strings. |
|
|
|
Examples: |
|
>>> data = ["hello world"] |
|
>>> wordlength = evaluate.load("word_length", module_type="measurement") |
|
>>> results = wordlength.compute(data=data) |
|
>>> print(results) |
|
{'average_word_length': 2} |
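
    A custom tokenizer can be passed the same way (illustrative only; `str.split` stands in
    for any callable that takes a string and returns a list of tokens):
    >>> results = wordlength.compute(data=data, tokenizer=str.split)
    >>> print(results)
    {'average_word_length': 2}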
"""

_CITATION = """\
@InProceedings{huggingface:module,
title = {A great new module},
authors={huggingface, Inc.},
year={2020}
}
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class WordLength(evaluate.Measurement):
    """This measurement returns the average number of words in the input string(s)."""

    def _info(self):
        return evaluate.MeasurementInfo(
            # This description appears on the module page.
            module_type="measurement",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # The expected input format: a single string column named "data".
            features=datasets.Features(
                {
                    "data": datasets.Value("string"),
                }
            ),
        )

    def _download_and_prepare(self, dl_manager):
        """Downloads the NLTK tokenizer data required by the default `word_tokenize`."""
        import nltk

        # NLTK >= 3.9 distributes the Punkt tokenizer data as "punkt_tab"; older releases use "punkt".
        if NLTK_VERSION >= version.Version("3.9.0"):
            nltk.download("punkt_tab")
        else:
            nltk.download("punkt")

    def _compute(self, data, tokenizer=word_tokenize):
        """Returns the average number of words per input string."""
        # Tokenize each string, count its tokens, then average the counts.
        lengths = [len(tokenizer(d)) for d in data]
        average_length = mean(lengths)
        return {"average_word_length": average_length}
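

# A minimal usage sketch for running this file directly (not part of the measurement itself);
# it assumes `evaluate.load` can resolve the "word_length" measurement and simply exercises
# both the default NLTK tokenizer and a custom whitespace tokenizer.
if __name__ == "__main__":
    wordlength = evaluate.load("word_length", module_type="measurement")
    print(wordlength.compute(data=["hello world", "foo bar baz"]))
    print(wordlength.compute(data=["hello world", "foo bar baz"], tokenizer=str.split))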