davebulaval committed on
Commit 9db8db7
1 Parent(s): 7cd4a72

initial metrics codebase

Files changed (4)
  1. README.md +63 -2
  2. app.py +5 -0
  3. meaningbert.py +134 -0
  4. requirements.txt +1 -0
README.md CHANGED
@@ -1,5 +1,5 @@
  ---
- title: Meaningbert
+ title: MeaningBERT
  emoji: 🦀
  colorFrom: purple
  colorTo: indigo
@@ -9,4 +9,65 @@ app_file: app.py
  pinned: false
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Here is MeaningBERT
+
+ MeaningBERT is an automatic and trainable metric for assessing meaning preservation between sentences. MeaningBERT was
+ proposed in our
+ article [MeaningBERT: assessing meaning preservation between sentences](https://www.frontiersin.org/articles/10.3389/frai.2023.1223924/full).
+ Its goal is to assess meaning preservation between two sentences in a way that correlates highly with human judgments
+ and passes sanity checks. For more details, refer to our publicly available article.
+
+ > This public version of our model is the single best model we trained (our article reports results averaged over 10
+ > models), trained for a more extended period (1,000 epochs instead of 250). We later observed that longer training
+ > further reduces the dev loss and increases performance.
+
+ ## Sanity Check
+
+ Correlation to human judgment is one way to evaluate the quality of a meaning preservation metric. However, it is
+ inherently subjective, since it uses human judgment as a gold standard, and expensive, since it requires a large
+ dataset annotated by several humans. As an alternative, we designed two automated tests: evaluating meaning
+ preservation between identical sentences (which should be 100% preserving) and between unrelated sentences (which
+ should be 0% preserving). In these tests, the meaning preservation target value is not subjective and does not require
+ human annotation to measure. They represent a trivial and minimal threshold that a good automatic meaning preservation
+ metric should be able to achieve. Namely, a metric should minimally be able to return a perfect score (i.e., 100%)
+ when two identical sentences are compared and a null score (i.e., 0%) when two sentences are completely unrelated.
+
+ ### Identical sentences
+
+ The first test evaluates meaning preservation between identical sentences. To analyze a metric's ability to pass this
+ test, we count the number of times its rating is greater than or equal to a threshold value X ∈ [95, 99] and divide it
+ by the number of sentences, giving the ratio of cases where the metric returns the expected rating. To account for
+ computer floating-point inaccuracy, we round the ratings to the nearest integer and do not use a threshold value of
+ 100%.
+
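For illustration, here is a minimal sketch of this ratio computation (a hypothetical helper, not part of this commit; it assumes one metric rating in [0, 100] per identical-sentence pair):

```python
def identical_pass_ratio(ratings, threshold=95):
    # Round to the nearest integer to absorb floating-point inaccuracy,
    # then count ratings at or above the threshold (X in [95, 99]).
    passed = sum(1 for rating in ratings if round(rating) >= threshold)
    return passed / len(ratings)
```
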
+ ### Unrelated sentences
+
+ Our second test evaluates meaning preservation between a source sentence and an unrelated sentence generated by a large
+ language model. The idea is to verify that the metric finds a meaning preservation rating of 0 when given a completely
+ irrelevant sentence mainly composed of irrelevant words (also known as a word soup). Since this test's expected rating
+ is 0, we check that the metric rating is lower than or equal to a threshold value X ∈ [1, 5]. Again, to account for
+ computer floating-point inaccuracy, we round the ratings to the nearest integer and do not use a threshold value of 0%.
+
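A matching sketch for this check (again a hypothetical helper; the expected rating is 0, so a rounded rating at or below the threshold counts as a pass):

```python
def unrelated_pass_ratio(ratings, threshold=5):
    # Round to the nearest integer, then count ratings at or below the
    # threshold (X in [1, 5]).
    passed = sum(1 for rating in ratings if round(rating) <= threshold)
    return passed / len(ratings)
```
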
+ ## Cite
+
+ Use the following citation to cite MeaningBERT:
+
+ ```
+ @ARTICLE{10.3389/frai.2023.1223924,
+   AUTHOR={Beauchemin, David and Saggion, Horacio and Khoury, Richard},
+   TITLE={MeaningBERT: assessing meaning preservation between sentences},
+   JOURNAL={Frontiers in Artificial Intelligence},
+   VOLUME={6},
+   YEAR={2023},
+   URL={https://www.frontiersin.org/articles/10.3389/frai.2023.1223924},
+   DOI={10.3389/frai.2023.1223924},
+   ISSN={2624-8212},
+ }
+ ```
+
+ ## License
+
+ MeaningBERT is MIT licensed, as found in
+ the [LICENSE file](https://github.com/GRAAL-Research/risc/blob/main/LICENSE).
app.py ADDED
@@ -0,0 +1,5 @@
+ import evaluate
+ from evaluate.utils import launch_gradio_widget
+
+ module = evaluate.load("meaningbert")
+ launch_gradio_widget(module)
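For reference, a minimal sketch of calling the metric directly instead of through the Gradio widget (the sentence pair mirrors the docstring example in meaningbert.py below):

```python
import evaluate

# Load the metric and score one (document, simplification) pair.
meaning_bert = evaluate.load("meaningbert")
results = meaning_bert.compute(
    documents=["hello there"],
    simplifications=["hello there"],
)
print(results["scores"])  # one score per pair; results also carries a "hashcode"
```
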
meaningbert.py ADDED
@@ -0,0 +1,134 @@
+ # Copyright 2020 The HuggingFace Evaluate Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """MeaningBERT metric."""
+
+ from contextlib import contextmanager
+ from typing import Dict, List
+
+ import datasets
+ import evaluate
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
+
+
+ @contextmanager
+ def filter_logging_context():
+     # Filter out the expected "This IS expected if you are initializing"
+     # warning emitted while the pretrained weights are loaded.
+     def filter_log(record):
+         return "This IS expected if you are initializing" not in record.msg
+
+     logger = datasets.utils.logging.get_logger("transformers.modeling_utils")
+     logger.addFilter(filter_log)
+     try:
+         yield
+     finally:
+         logger.removeFilter(filter_log)
+
+
+ _CITATION = """\
+ @ARTICLE{10.3389/frai.2023.1223924,
+   AUTHOR={Beauchemin, David and Saggion, Horacio and Khoury, Richard},
+   TITLE={MeaningBERT: assessing meaning preservation between sentences},
+   JOURNAL={Frontiers in Artificial Intelligence},
+   VOLUME={6},
+   YEAR={2023},
+   URL={https://www.frontiersin.org/articles/10.3389/frai.2023.1223924},
+   DOI={10.3389/frai.2023.1223924},
+   ISSN={2624-8212},
+ }
+ """
+
+ _DESCRIPTION = """\
+ MeaningBERT is an automatic and trainable metric for assessing meaning preservation between sentences. MeaningBERT was
+ proposed in our
+ article [MeaningBERT: assessing meaning preservation between sentences](https://www.frontiersin.org/articles/10.3389/frai.2023.1223924/full).
+ Its goal is to assess meaning preservation between two sentences in a way that correlates highly with human judgments
+ and passes sanity checks. For more details, refer to our publicly available article.
+
+ See the project's README at https://github.com/GRAAL-Research/MeaningBERT for more information.
+ """
+
+ _KWARGS_DESCRIPTION = """
+ MeaningBERT metric for assessing meaning preservation between sentences.
+
+ Args:
+     documents (list of str): Document sentences.
+     simplifications (list of str): Simplification sentences (same number of elements as documents).
+     verbose (bool): Turn on intermediate status updates.
+
+ Returns:
+     scores: the meaning preservation scores, as a list respecting the order of the (document, simplification) pairs.
+     hashcode: hashcode of the library.
+
+ Examples:
+
+     >>> documents = ["hello there", "general kenobi"]
+     >>> simplifications = ["hello there", "general kenobi"]
+     >>> meaning_bert = evaluate.load("meaningbert")
+     >>> results = meaning_bert.compute(documents=documents, simplifications=simplifications)
+ """
+
+ _HASH = "21845c0cc85a2e8e16c89bb0053f489095cf64c5b19e9c3865d3e10047aba51b"
+
+
+ @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+ class MeaningBERTScore(evaluate.Metric):
+     def _info(self):
+         return evaluate.MetricInfo(
+             description=_DESCRIPTION,
+             citation=_CITATION,
+             homepage="https://github.com/GRAAL-Research/MeaningBERT",
+             inputs_description=_KWARGS_DESCRIPTION,
+             features=[
+                 datasets.Features(
+                     {
+                         "documents": datasets.Value("string", id="sequence"),
+                         "simplifications": datasets.Value("string", id="sequence"),
+                     }
+                 )
+             ],
+             codebase_urls=["https://github.com/GRAAL-Research/MeaningBERT"],
+             reference_urls=[
+                 "https://github.com/GRAAL-Research/MeaningBERT",
+                 "https://www.frontiersin.org/articles/10.3389/frai.2023.1223924/full",
+             ],
+         )
+
+     def _compute(
+         self,
+         documents: List,
+         simplifications: List,
+         verbose: bool = False,
+     ) -> Dict:
+         assert len(documents) == len(
+             simplifications
+         ), "The number of documents differs from the number of simplifications."
+         hashcode = _HASH
+
+         # We load the MeaningBERT pretrained model
+         scorer = AutoModelForSequenceClassification.from_pretrained("davebulaval/MeaningBERT")
+
+         # We load the MeaningBERT tokenizer
+         tokenizer = AutoTokenizer.from_pretrained("davebulaval/MeaningBERT")
+
+         # We tokenize the text as sentence pairs and return PyTorch tensors
+         tokenize_text = tokenizer(documents, simplifications, truncation=True, padding=True, return_tensors="pt")
+
+         with filter_logging_context():
+             # We process the text
+             scores = scorer(**tokenize_text)
+
+         output_dict = {
+             "scores": scores.logits.tolist(),
+             "hashcode": hashcode,
+         }
+         return output_dict
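Note that `scores.logits.tolist()` yields one nested list per pair when the model has a single regression output, e.g. `[[s1], [s2]]`. A sketch of flattening on the caller's side (assuming that single-logit head):

```python
# Hypothetical post-processing: [[s1], [s2], ...] -> [s1, s2, ...]
flat_scores = [logit[0] for logit in results["scores"]]
```
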
requirements.txt ADDED
@@ -0,0 +1 @@
+ evaluate