Update Space (evaluate main: 828c6327)
Browse files- README.md +57 -12
- app.py +6 -0
- requirements.txt +3 -0
- word_length.py +78 -0
README.md
CHANGED
@@ -1,12 +1,57 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Measurement Card for Word Length
|
2 |
+
|
3 |
+
|
4 |
+
## Metric Description
|
5 |
+
|
6 |
+
The `word_length` measurement returns the word count of the input string, based on tokenization using [NLTK word_tokenize](https://www.nltk.org/api/nltk.tokenize.html).
|
7 |
+
|
8 |
+
## How to Use
|
9 |
+
|
10 |
+
This measurement requires a list of strings as input:
|
11 |
+
|
12 |
+
```python
|
13 |
+
>>> data = ["hello world"]
|
14 |
+
>>> wordlength = evaluate.load("word_length", type="measurement")
|
15 |
+
>>> results = wordlength.compute(data=data)
|
16 |
+
```
|
17 |
+
|
18 |
+
### Inputs
|
19 |
+
- **data** (list of `str`): The input list of strings for which the word length is calculated.
|
20 |
+
- **tokenizer** (`Callable`) : approach used for tokenizing `data` (optional). The default tokenizer is [NLTK's `word_tokenize`](https://www.nltk.org/api/nltk.tokenize.html). This can be replaced by any function that takes a string as input and returns a list of tokens as output.
|
21 |
+
|
22 |
+
### Output Values
|
23 |
+
- **average_word_length**(`float`): the average number of words in the input string(s).
|
24 |
+
|
25 |
+
Output Example(s):
|
26 |
+
|
27 |
+
```python
|
28 |
+
{"average_word_length": 245}
|
29 |
+
```
|
30 |
+
|
31 |
+
This measurement outputs a dictionary containing the average number of words across the input string(s) (`average_word_length`).
|
32 |
+
|
33 |
+
### Examples
|
34 |
+
|
35 |
+
Example for a single string
|
36 |
+
|
37 |
+
```python
|
38 |
+
>>> data = ["hello sun and goodbye moon"]
|
39 |
+
>>> wordlength = evaluate.load("word_length", type="measurement")
|
40 |
+
>>> results = wordlength.compute(data=data)
|
41 |
+
>>> print(results)
|
42 |
+
{'average_word_length': 5}
|
43 |
+
```
|
44 |
+
|
45 |
+
Example for multiple strings
|
46 |
+
```python
|
47 |
+
>>> data = ["hello sun and goodbye moon", "foo bar foo bar"]
|
48 |
+
>>> wordlength = evaluate.load("word_length", type="measurement")
|
49 |
+
>>> results = wordlength.compute(data=data)
|
50 |
+
{'average_word_length': 4.5}
|
51 |
+
```
|
52 |
+
|
53 |
+
## Citation(s)
|
54 |
+
|
55 |
+
|
56 |
+
## Further References
|
57 |
+
- [NLTK's `word_tokenize`](https://www.nltk.org/api/nltk.tokenize.html)
|
app.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import evaluate
from evaluate.utils import launch_gradio_widget


# Space entry point: load the local `word_length` measurement module and
# expose it through evaluate's standard Gradio widget so the Space serves
# an interactive demo of the metric.
module = evaluate.load("word_length", type="measurement")
launch_gradio_widget(module)
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
git+https://github.com/huggingface/evaluate.git@main
|
2 |
+
datasets~=2.0
|
3 |
+
nltk~=3.7
|
word_length.py
ADDED
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
from nltk import word_tokenize
|
16 |
+
import evaluate
|
17 |
+
import datasets
|
18 |
+
from statistics import mean
|
19 |
+
|
20 |
+
|
21 |
+
# Short description shown on the module page; also prepended to the
# WordLength class docstring via the add_start_docstrings decorator below.
_DESCRIPTION = """
Returns the average length (in terms of the number of words) of the input data.
"""

# Usage documentation (arguments, return value, doctest-style example);
# surfaced by evaluate as the module's `inputs_description`.
_KWARGS_DESCRIPTION = """
Args:
    `data`: a list of `str` for which the word length is calculated.
    `tokenizer` (`Callable`) : the approach used for tokenizing `data` (optional).
        The default tokenizer is `word_tokenize` from NLTK: https://www.nltk.org/api/nltk.tokenize.html
        This can be replaced by any function that takes a string as input and returns a list of tokens as output.

Returns:
    `average_word_length` (`float`) : the average number of words in the input list of strings.

Examples:
    >>> data = ["hello world"]
    >>> wordlength = evaluate.load("word_length", type="measurement")
    >>> results = wordlength.compute(data=data)
    >>> print(results)
    {'average_word_length': 2}
"""

# TODO: Add BibTeX citation
# Placeholder citation from the module template — replace before release.
_CITATION = """\
@InProceedings{huggingface:module,
title = {A great new module},
authors={huggingface, Inc.},
year={2020}
}
"""
|
51 |
+
|
52 |
+
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class WordLength(evaluate.EvaluationModule):
    """Measurement returning the average number of words (tokens) in the input string(s)."""

    def _info(self):
        # Declares module metadata and the expected input schema: a single
        # string column named "data".
        return evaluate.EvaluationModuleInfo(
            type="measurement",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "data": datasets.Value("string"),
                }
            ),
        )

    def _download_and_prepare(self, dl_manager):
        # The default tokenizer (nltk.word_tokenize) needs the "punkt"
        # tokenizer models available locally at runtime.
        import nltk

        nltk.download("punkt")

    def _compute(self, data, tokenizer=word_tokenize):
        """Return the average word length of the input data.

        Args:
            data: list of `str` to measure.
            tokenizer: callable mapping a string to a list of tokens
                (defaults to NLTK's `word_tokenize`).

        Returns:
            dict with key `average_word_length` (`float`): the mean number
            of tokens per input string.

        Raises:
            ValueError: if `data` is empty — the average is undefined.
        """
        if not data:
            # statistics.mean([]) would raise an opaque StatisticsError;
            # fail early with a clear message instead.
            raise ValueError("`data` must contain at least one string.")
        lengths = [len(tokenizer(d)) for d in data]
        average_length = mean(lengths)
        return {"average_word_length": average_length}