Spaces:
Build error
Build error
Update Space (evaluate main: e4a27243)
Browse files
- requirements.txt +1 -1
- word_count.py +18 -3
requirements.txt
CHANGED
@@ -1,2 +1,2 @@
|
|
1 |
-
git+https://github.com/huggingface/evaluate.git@
|
2 |
sklearn~=0.0
|
|
|
1 |
+
git+https://github.com/huggingface/evaluate.git@e4a2724377909fe2aeb4357e3971e5a569673b39
|
2 |
sklearn~=0.0
|
word_count.py
CHANGED
@@ -12,6 +12,9 @@
|
|
12 |
# See the License for the specific language governing permissions and
|
13 |
# limitations under the License.
|
14 |
|
|
|
|
|
|
|
15 |
import datasets
|
16 |
from sklearn.feature_extraction.text import CountVectorizer
|
17 |
|
@@ -41,18 +44,30 @@ Examples:
|
|
41 |
_CITATION = ""
|
42 |
|
43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
|
45 |
class WordCount(evaluate.Measurement):
|
46 |
"""This measurement returns the total number of words and the number of unique words
|
47 |
in the input string(s)."""
|
48 |
|
49 |
-
|
|
|
|
|
|
|
50 |
return evaluate.MeasurementInfo(
|
51 |
# This is the description that will appear on the modules page.
|
52 |
module_type="measurement",
|
53 |
description=_DESCRIPTION,
|
54 |
citation=_CITATION,
|
55 |
inputs_description=_KWARGS_DESCRIPTION,
|
|
|
56 |
features=datasets.Features(
|
57 |
{
|
58 |
"data": datasets.Value("string"),
|
@@ -60,9 +75,9 @@ class WordCount(evaluate.Measurement):
|
|
60 |
),
|
61 |
)
|
62 |
|
63 |
-
def _compute(self, data, max_vocab=None):
|
64 |
"""Returns the number of unique words in the input data"""
|
65 |
-
count_vectorizer = CountVectorizer(max_features=max_vocab)
|
66 |
document_matrix = count_vectorizer.fit_transform(data)
|
67 |
word_count = document_matrix.sum()
|
68 |
unique_words = document_matrix.shape[1]
|
|
|
12 |
# See the License for the specific language governing permissions and
|
13 |
# limitations under the License.
|
14 |
|
15 |
+
from dataclasses import dataclass
|
16 |
+
from typing import Optional
|
17 |
+
|
18 |
import datasets
|
19 |
from sklearn.feature_extraction.text import CountVectorizer
|
20 |
|
|
|
44 |
_CITATION = ""
|
45 |
|
46 |
|
47 |
+
@dataclass
|
48 |
+
class WordCount(evaluate.info.Config):
|
49 |
+
|
50 |
+
name: str = "default"
|
51 |
+
|
52 |
+
max_vocab: Optional[int] = None
|
53 |
+
|
54 |
+
|
55 |
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
|
56 |
class WordCount(evaluate.Measurement):
|
57 |
"""This measurement returns the total number of words and the number of unique words
|
58 |
in the input string(s)."""
|
59 |
|
60 |
+
CONFIG_CLASS = WordCount
|
61 |
+
ALLOWED_CONFIG_NAMES = ["default"]
|
62 |
+
|
63 |
+
def _info(self, config):
|
64 |
return evaluate.MeasurementInfo(
|
65 |
# This is the description that will appear on the modules page.
|
66 |
module_type="measurement",
|
67 |
description=_DESCRIPTION,
|
68 |
citation=_CITATION,
|
69 |
inputs_description=_KWARGS_DESCRIPTION,
|
70 |
+
config=config,
|
71 |
features=datasets.Features(
|
72 |
{
|
73 |
"data": datasets.Value("string"),
|
|
|
75 |
),
|
76 |
)
|
77 |
|
78 |
+
def _compute(self, data):
|
79 |
"""Returns the number of unique words in the input data"""
|
80 |
+
count_vectorizer = CountVectorizer(max_features=self.config.max_vocab)
|
81 |
document_matrix = count_vectorizer.fit_transform(data)
|
82 |
word_count = document_matrix.sum()
|
83 |
unique_words = document_matrix.shape[1]
|