lvwerra committed
Commit 3cb08e2
1 Parent(s): 69cb5ac

Update Space (evaluate main: eaf34a7d)

Files changed (3):
  1. README.md +11 -1
  2. requirements.txt +1 -1
  3. rouge.py +20 -7
README.md CHANGED
@@ -42,10 +42,20 @@ At minimum, this metric takes as input a list of predictions and a list of refe
  {'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0}
  ```
 
+ It can also deal with lists of references for each prediction:
+ ```python
+ >>> rouge = evaluate.load('rouge')
+ >>> predictions = ["hello there", "general kenobi"]
+ >>> references = [["hello", "there"], ["general kenobi", "general yoda"]]
+ >>> results = rouge.compute(predictions=predictions,
+ ...                         references=references)
+ >>> print(results)
+ {'rouge1': 0.8333, 'rouge2': 0.5, 'rougeL': 0.8333, 'rougeLsum': 0.8333}
+ ```
+
  ### Inputs
  - **predictions** (`list`): list of predictions to score. Each prediction
    should be a string with tokens separated by spaces.
- - **references** (`list`): list of references for each prediction. Each
+ - **references** (`list` or `list[list]`): list of references for each prediction, or a list of several references per prediction. Each
    reference should be a string with tokens separated by spaces.
  - **rouge_types** (`list`): A list of rouge types to calculate. Defaults to `['rouge1', 'rouge2', 'rougeL', 'rougeLsum']`.
  - Valid rouge types:
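For context on the new numbers, here is a minimal sketch, assuming `rouge_score>=0.1.2` (the version pinned below), that reproduces them with the underlying scorer directly: `score_multi` keeps the best-matching reference for each prediction, and the aggregate reported by `compute` is the average across examples.

```python
# Minimal sketch (assumes rouge_score>=0.1.2): reproduce the README's
# multi-reference numbers with the underlying scorer.
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2"])

# "hello there" vs its best single reference ("hello" or "there"):
# rouge1 fmeasure = 2 * (0.5 * 1.0) / (0.5 + 1.0) ≈ 0.6667; rouge2 = 0.0
# (the lone-word references contain no bigrams).
print(scorer.score_multi(["hello", "there"], "hello there"))

# "general kenobi" matches one reference exactly, so every type scores 1.0.
print(scorer.score_multi(["general kenobi", "general yoda"], "general kenobi"))

# Averaged over the two examples: rouge1 ≈ (0.6667 + 1.0) / 2 = 0.8333,
# rouge2 = (0.0 + 1.0) / 2 = 0.5 -- the values printed in the README.
```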
requirements.txt CHANGED
@@ -2,4 +2,4 @@ git+https://github.com/huggingface/evaluate@a45df1eb9996eec64ec3282ebe554061cb36
 datasets~=2.0
 absl-py
 nltk
-rouge_score
+rouge_score>=0.1.2
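The new floor matters because the multi-reference path added to rouge.py below calls `RougeScorer.score_multi`, which older `rouge_score` releases do not ship (to my knowledge it first appeared in 0.1.2). An illustrative guard, not part of this commit, that fails fast on an outdated install:

```python
# Illustrative only (not in the commit): detect a rouge_score build that
# predates score_multi before any scoring work is done.
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(["rouge1"])
if not hasattr(scorer, "score_multi"):
    raise ImportError("multi-reference ROUGE needs rouge_score>=0.1.2")
```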
rouge.py CHANGED
@@ -87,12 +87,20 @@ class Rouge(evaluate.Metric):
             description=_DESCRIPTION,
             citation=_CITATION,
             inputs_description=_KWARGS_DESCRIPTION,
-            features=datasets.Features(
-                {
-                    "predictions": datasets.Value("string", id="sequence"),
-                    "references": datasets.Value("string", id="sequence"),
-                }
-            ),
+            features=[
+                datasets.Features(
+                    {
+                        "predictions": datasets.Value("string", id="sequence"),
+                        "references": datasets.Sequence(datasets.Value("string", id="sequence")),
+                    }
+                ),
+                datasets.Features(
+                    {
+                        "predictions": datasets.Value("string", id="sequence"),
+                        "references": datasets.Value("string", id="sequence"),
+                    }
+                ),
+            ],
             codebase_urls=["https://github.com/google-research/google-research/tree/master/rouge"],
             reference_urls=[
                 "https://en.wikipedia.org/wiki/ROUGE_(metric)",
@@ -104,6 +112,8 @@ class Rouge(evaluate.Metric):
         if rouge_types is None:
             rouge_types = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
 
+        multi_ref = isinstance(references[0], list)
+
         scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types, use_stemmer=use_stemmer)
         if use_aggregator:
             aggregator = scoring.BootstrapAggregator()
@@ -111,7 +121,10 @@ class Rouge(evaluate.Metric):
             scores = []
 
         for ref, pred in zip(references, predictions):
-            score = scorer.score(ref, pred)
+            if multi_ref:
+                score = scorer.score_multi(ref, pred)
+            else:
+                score = scorer.score(ref, pred)
             if use_aggregator:
                 aggregator.add_scores(score)
             else:
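Taken together: the `features` list lets the metric accept either a string reference or a list of strings per prediction, and `multi_ref` switches between `scorer.score` and `scorer.score_multi` accordingly. A short usage sketch of both shapes, assuming `evaluate` resolves a features list by matching the provided inputs against each entry:

```python
# Usage sketch: both input shapes now typecheck against the metric.
import evaluate

rouge = evaluate.load("rouge")

# One string reference per prediction -> the scorer.score path.
print(rouge.compute(predictions=["hello there"], references=["hello there"]))

# A list of references per prediction -> the scorer.score_multi path.
print(rouge.compute(predictions=["hello there"],
                    references=[["hello there", "hi there"]]))
```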