Spaces:

evaluate-measurement
/

word_count

Build error

App Files Community

lvwerra HF staff committed on Sep 22, 2022

Commit

2ce3448

1 Parent(s): 7a9f6c9

Update Space (evaluate main: e4a27243)

Browse files

Files changed (2) hide show

requirements.txt +1 -1
word_count.py +18 -3

requirements.txt CHANGED Viewed

	@@ -1,2 +1,2 @@
1	- git+https://github.com/huggingface/evaluate.git@~~80448674f5447a9682afe051db243c4a13bfe4ff~~
2	sklearn~=0.0


1	+ git+https://github.com/huggingface/evaluate.git@e4a2724377909fe2aeb4357e3971e5a569673b39
2	sklearn~=0.0

word_count.py CHANGED Viewed

@@ -12,6 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import datasets
 from sklearn.feature_extraction.text import CountVectorizer
@@ -41,18 +44,30 @@ Examples:
 _CITATION = ""
 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class WordCount(evaluate.Measurement):
     """This measurement returns the total number of words and the number of unique words
     in the input string(s)."""
-    def _info(self):
         return evaluate.MeasurementInfo(
             # This is the description that will appear on the modules page.
             module_type="measurement",
             description=_DESCRIPTION,
             citation=_CITATION,
             inputs_description=_KWARGS_DESCRIPTION,
             features=datasets.Features(
                 {
                     "data": datasets.Value("string"),
@@ -60,9 +75,9 @@ class WordCount(evaluate.Measurement):
             ),
         )
-    def _compute(self, data, max_vocab=None):
         """Returns the number of unique words in the input data"""
-        count_vectorizer = CountVectorizer(max_features=max_vocab)
         document_matrix = count_vectorizer.fit_transform(data)
         word_count = document_matrix.sum()
         unique_words = document_matrix.shape[1]

 # See the License for the specific language governing permissions and
 # limitations under the License.
+from dataclasses import dataclass
+from typing import Optional
 import datasets
 from sklearn.feature_extraction.text import CountVectorizer
 _CITATION = ""
+@dataclass
+class WordCount(evaluate.info.Config):
+    name: str = "default"
+    max_vocab: Optional[int] = None
 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class WordCount(evaluate.Measurement):
     """This measurement returns the total number of words and the number of unique words
     in the input string(s)."""
+    CONFIG_CLASS = WordCount
+    ALLOWED_CONFIG_NAMES = ["default"]
+    def _info(self, config):
         return evaluate.MeasurementInfo(
             # This is the description that will appear on the modules page.
             module_type="measurement",
             description=_DESCRIPTION,
             citation=_CITATION,
             inputs_description=_KWARGS_DESCRIPTION,
+            config=config,
             features=datasets.Features(
                 {
                     "data": datasets.Value("string"),
             ),
         )
+    def _compute(self, data):
         """Returns the number of unique words in the input data"""
+        count_vectorizer = CountVectorizer(max_features=self.config.max_vocab)
         document_matrix = count_vectorizer.fit_transform(data)
         word_count = document_matrix.sum()
         unique_words = document_matrix.shape[1]