lvwerra HF staff committed on
Commit
318295f
1 Parent(s): b93343f

Update Space (evaluate main: c447fc8e)

Browse files
Files changed (2) hide show
  1. requirements.txt +1 -1
  2. text_duplicates.py +4 -16
requirements.txt CHANGED
@@ -1 +1 @@
1
- git+https://github.com/huggingface/evaluate.git@e4a2724377909fe2aeb4357e3971e5a569673b39
1
+ git+https://github.com/huggingface/evaluate.git@c447fc8eda9c62af501bfdc6988919571050d950
text_duplicates.py CHANGED
@@ -14,7 +14,6 @@
14
 
15
  import hashlib
16
  from collections import Counter
17
- from dataclasses import dataclass
18
 
19
  import datasets
20
 
@@ -58,29 +57,18 @@ def get_hash(example):
58
  return hashlib.md5(example.strip().encode("utf-8")).hexdigest()
59
 
60
 
61
- @dataclass
62
- class TextDuplicatesConfig(evaluate.info.Config):
63
-
64
- name: str = "default"
65
-
66
- list_duplicates: bool = False
67
-
68
-
69
  @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
70
  class TextDuplicates(evaluate.Measurement):
71
  """This measurement returns the duplicate strings contained in the input(s)."""
72
 
73
- CONFIG_CLASS = TextDuplicatesConfig
74
- ALLOWED_CONFIG_NAMES = ["default"]
75
-
76
- def _info(self, config):
77
  return evaluate.MeasurementInfo(
78
  # This is the description that will appear on the modules page.
79
  module_type="measurement",
80
  description=_DESCRIPTION,
81
  citation=_CITATION,
82
  inputs_description=_KWARGS_DESCRIPTION,
83
- config=config,
84
  # This defines the format of each prediction and reference
85
  features=datasets.Features(
86
  {
@@ -89,9 +77,9 @@ class TextDuplicates(evaluate.Measurement):
89
  ),
90
  )
91
 
92
- def _compute(self, data):
93
  """Returns the duplicates contained in the input data and the number of times they are repeated."""
94
- if self.config.list_duplicates == True:
95
  logger.warning("This functionality can be memory-intensive for large datasets!")
96
  n_dedup = len(set([get_hash(d) for d in data]))
97
  c = Counter(data)
14
 
15
  import hashlib
16
  from collections import Counter
 
17
 
18
  import datasets
19
 
57
  return hashlib.md5(example.strip().encode("utf-8")).hexdigest()
58
 
59
 
 
 
 
 
 
 
 
 
60
  @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
61
  class TextDuplicates(evaluate.Measurement):
62
  """This measurement returns the duplicate strings contained in the input(s)."""
63
 
64
+ def _info(self):
65
+ # TODO: Specifies the evaluate.MeasurementInfo object
 
 
66
  return evaluate.MeasurementInfo(
67
  # This is the description that will appear on the modules page.
68
  module_type="measurement",
69
  description=_DESCRIPTION,
70
  citation=_CITATION,
71
  inputs_description=_KWARGS_DESCRIPTION,
 
72
  # This defines the format of each prediction and reference
73
  features=datasets.Features(
74
  {
77
  ),
78
  )
79
 
80
+ def _compute(self, data, list_duplicates=False):
81
  """Returns the duplicates contained in the input data and the number of times they are repeated."""
82
+ if list_duplicates == True:
83
  logger.warning("This functionality can be memory-intensive for large datasets!")
84
  n_dedup = len(set([get_hash(d) for d in data]))
85
  c = Counter(data)