lvwerra HF staff commited on
Commit
0b617d0
1 Parent(s): e5754d4

Update Space (evaluate main: 828c6327)

Browse files
Files changed (4) hide show
  1. README.md +57 -12
  2. app.py +6 -0
  3. requirements.txt +3 -0
  4. word_length.py +78 -0
README.md CHANGED
@@ -1,12 +1,57 @@
1
- ---
2
- title: Word_length
3
- emoji: 🐨
4
- colorFrom: pink
5
- colorTo: yellow
6
- sdk: gradio
7
- sdk_version: 3.0.2
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Measurement Card for Word Length
2
+
3
+
4
+ ## Metric Description
5
+
6
+ The `word_length` measurement returns the word count of the input string, based on tokenization using [NLTK word_tokenize](https://www.nltk.org/api/nltk.tokenize.html).
7
+
8
+ ## How to Use
9
+
10
+ This measurement requires a list of strings as input:
11
+
12
+ ```python
13
+ >>> data = ["hello world"]
14
+ >>> wordlength = evaluate.load("word_length", type="measurement")
15
+ >>> results = wordlength.compute(data=data)
16
+ ```
17
+
18
+ ### Inputs
19
+ - **data** (list of `str`): The input list of strings for which the word length is calculated.
20
+ - **tokenizer** (`Callable`) : approach used for tokenizing `data` (optional). The default tokenizer is [NLTK's `word_tokenize`](https://www.nltk.org/api/nltk.tokenize.html). This can be replaced by any function that takes a string as input and returns a list of tokens as output.
21
+
22
+ ### Output Values
23
+ - **average_word_length** (`float`): the average number of words in the input string(s).
24
+
25
+ Output Example(s):
26
+
27
+ ```python
28
+ {"average_word_length": 245}
29
+ ```
30
+
31
+ This measurement outputs a dictionary containing the average number of words in the input string(s) (`average_word_length`).
32
+
33
+ ### Examples
34
+
35
+ Example for a single string
36
+
37
+ ```python
38
+ >>> data = ["hello sun and goodbye moon"]
39
+ >>> wordlength = evaluate.load("word_length", type="measurement")
40
+ >>> results = wordlength.compute(data=data)
41
+ >>> print(results)
42
+ {'average_word_length': 5}
43
+ ```
44
+
45
+ Example for multiple strings
46
+ ```python
47
+ >>> data = ["hello sun and goodbye moon", "foo bar foo bar"]
48
+ >>> wordlength = evaluate.load("word_length", type="measurement")
49
+ >>> results = wordlength.compute(data=data)
50
+ {'average_word_length': 4.5}
51
+ ```
52
+
53
+ ## Citation(s)
54
+
55
+
56
+ ## Further References
57
+ - [NLTK's `word_tokenize`](https://www.nltk.org/api/nltk.tokenize.html)
app.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
"""Gradio Space entry point: expose the `word_length` measurement as a widget."""
import evaluate
from evaluate.utils import launch_gradio_widget


# Load the measurement module from the evaluate hub and hand it to the
# stock widget UI provided by the evaluate library.
word_length = evaluate.load("word_length", type="measurement")
launch_gradio_widget(word_length)
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ git+https://github.com/huggingface/evaluate.git@main
2
+ datasets~=2.0
3
+ nltk~=3.7
word_length.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2022 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from nltk import word_tokenize
16
+ import evaluate
17
+ import datasets
18
+ from statistics import mean
19
+
20
+
21
# One-line summary shown on the module page; also prepended to the class
# docstring via the `add_start_docstrings` decorator below.
_DESCRIPTION = """
Returns the average length (in terms of the number of words) of the input data.
"""

# Usage documentation: arguments, return value, and a doctest-style example.
# NOTE(review): original indentation was lost in extraction; reconstructed
# following the standard evaluate module template — confirm against upstream.
_KWARGS_DESCRIPTION = """
Args:
    `data`: a list of `str` for which the word length is calculated.
    `tokenizer` (`Callable`) : the approach used for tokenizing `data` (optional).
        The default tokenizer is `word_tokenize` from NLTK: https://www.nltk.org/api/nltk.tokenize.html
        This can be replaced by any function that takes a string as input and returns a list of tokens as output.

Returns:
    `average_word_length` (`float`) : the average number of words in the input list of strings.

Examples:
    >>> data = ["hello world"]
    >>> wordlength = evaluate.load("word_length", type="measurement")
    >>> results = wordlength.compute(data=data)
    >>> print(results)
    {'average_word_length': 2}
"""

# TODO: Add BibTeX citation
# Placeholder citation from the evaluate module template — not a real paper.
_CITATION = """\
@InProceedings{huggingface:module,
title = {A great new module},
authors={huggingface, Inc.},
year={2020}
}
"""
51
+
52
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class WordLength(evaluate.EvaluationModule):
    """Measurement that returns the average number of words in the input string(s).

    Tokenization defaults to NLTK's ``word_tokenize``; any callable that maps a
    string to a list of tokens may be supplied instead via ``tokenizer``.
    """

    def _info(self):
        # Metadata consumed by `evaluate.load` and displayed on the module page.
        return evaluate.EvaluationModuleInfo(
            type="measurement",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # Each input example is a single string under the `data` key.
            features=datasets.Features(
                {
                    "data": datasets.Value("string"),
                }
            ),
        )

    def _download_and_prepare(self, dl_manager):
        # The default tokenizer (NLTK `word_tokenize`) requires the "punkt"
        # tokenizer models; fetch them once when the module is prepared.
        import nltk

        nltk.download("punkt")

    def _compute(self, data, tokenizer=word_tokenize):
        """Return the average number of tokens per input string.

        Args:
            data: list of `str` to measure.
            tokenizer: callable mapping a string to a list of tokens
                (defaults to NLTK's `word_tokenize`).

        Returns:
            dict with a single key, `average_word_length` (`float`).

        Raises:
            ValueError: if `data` is empty — the mean is undefined, and
                `statistics.mean` would otherwise raise an opaque
                `StatisticsError`.
        """
        if not data:
            # Fail early with a clear message instead of StatisticsError.
            raise ValueError("`data` is empty; cannot compute the average word length.")
        lengths = [len(tokenizer(d)) for d in data]
        return {"average_word_length": mean(lengths)}