simondoebele committed on
Commit
6fe6c4e
1 Parent(s): 1bceae8

add new evaluation module

Files changed (3)
  1. app.py +6 -0
  2. autologiccreateworld.py +166 -0
  3. tests.py +17 -0
app.py ADDED
@@ -0,0 +1,6 @@
+ import evaluate
+ from evaluate.utils import launch_gradio_widget
+
+
+ module = evaluate.load("simondoebele/autologiccreateworld")
+ launch_gradio_widget(module)
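
For reference, the loaded module can also be called directly rather than through the Gradio widget. A minimal sketch: the reference labels and the nltk-style formula match the module code below, while the prediction string is only a placeholder, since the exact format expected by parse_LLM_output is defined in the accompanying Parser module.

import evaluate

module = evaluate.load("simondoebele/autologiccreateworld")

# The prediction string is a placeholder; the real format is whatever
# parse_LLM_output (from the accompanying Parser module) expects.
results = module.compute(
    predictions=["<LLM output describing a world model and keys>"],
    references=["satisfied"],        # one of "satisfied", "unsatisfied", "undefined"
    formulas=["exists x. P(x)"],     # first-order formula in nltk's logic syntax
)
print(results)  # {'accuracy': ...}
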
autologiccreateworld.py ADDED
@@ -0,0 +1,166 @@
+ # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """This metric module is designed to solve the task of creating a world model and corresponding keys,
+ given a formula and whether that formula needs to be satisfied. Please read my M.Sc. thesis for details."""
+
+ import evaluate
+ import datasets
+ from Parser import parse_LLM_output
+ from nltk.sem.logic import *
+ import nltk
+ from nltk.sem.logic import LogicParser
+ from nltk.sem.evaluate import Valuation, Model
+
+
+ # TODO: Add BibTeX citation
+ _CITATION = """\
+ @InProceedings{huggingface:module,
+ title = {Teaching LLMs Predicate Logic (M.Sc. Thesis)},
+ authors={Simon Döbele},
+ year={upcoming}
+ }
+ """
+
+ # TODO: Add description of the module here
+ _DESCRIPTION = """\
+ This metric module is designed to solve the task of creating a world model and corresponding keys,
+ given a formula and whether that formula needs to be satisfied. Please read my M.Sc. thesis for details.
+
+ In summary, my compute function behaves as it usually does, but in order to be able to
+ compare references and predictions, I need to pass the predictions to a parser.
+ This is why I rewrote just the compute function.
+ """
+
+
+ # TODO: Add description of the arguments of the module here
+ _KWARGS_DESCRIPTION = """
+ Calculates how good predictions are, given some references, using certain scores.
+ Args:
+     predictions: list of predictions to score. Each prediction
+         should be a string with tokens separated by spaces.
+     references: list of references, one for each prediction. Each
+         reference should be a string with tokens separated by spaces.
+     formulas: list of strings, needed for the parser to decide
+         whether the overall construction is valid (satisfied or unsatisfied).
+ Returns:
+     accuracy: proportion of predictions whose satisfaction status matches the reference.
+ Examples:
+     Examples should be written in doctest format, and should illustrate how
+     to use the function.
+
+     >>> my_new_module = evaluate.load("my_new_module")
+     >>> results = my_new_module.compute(references=[0, 1], predictions=[0, 1])
+     >>> print(results)
+     {'accuracy': 1.0}
+ """
+
+ # TODO: Define external resources urls if needed (once thesis is published)
+ # BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
+
+
+ # def convert_valuation_back(valuation):
+ #     # this is necessary, as jsonl could not serialize sets, but nltk expects sets for predicates.
+ #     return [(v[0], set(v[1])) if v[0].isupper() else v for v in valuation]
+
+ # def eval_task1(dataset):
+ #     results = []
+ #     for generated_output, target, valuation in zip(dataset["Predictions"], dataset["Target-sat"], dataset["Valuation"]):
+ #         try:
+ #             generated_formula = Expression.fromstring(generated_output)
+ #             #print("Parsed output:" + generated_formula)
+ #             valuation = convert_valuation_back(valuation)
+ #             #print("Valuation:" + valuation)
+ #             val = Valuation(valuation)
+ #             dom = val.domain
+ #             m = nltk.sem.evaluate.Model(dom, val)
+ #             g = nltk.sem.Assignment(dom)
+ #             sat = m.evaluate(generated_formula, g)
+ #             if sat == True:
+ #                 prediction = "satisfied"
+ #             elif sat == False:
+ #                 prediction = "unsatisfied"
+ #         except:
+ #             prediction = "undefined"
+
+ #         #print("Output:" + prediction + "-----Target:" + target)
+ #         results.append(prediction==target)
+
+ #     accuracy = sum(results)/len(results)
+ #     return accuracy
+
+
+ @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+ class AutoLogicCreateWorld(evaluate.Metric):
+     """NOTE: compared to a standard evaluate.Metric, only the _compute function changes,
+     as well as the datasets.Value feature types (we are working with strings)."""
+
+     def _info(self):
+         # TODO: Specifies the evaluate.EvaluationModuleInfo object
+         return evaluate.MetricInfo(
+             # This is the description that will appear on the modules page.
+             module_type="metric",
+             description=_DESCRIPTION,
+             citation=_CITATION,
+             inputs_description=_KWARGS_DESCRIPTION,
+             # This defines the format of each prediction and reference
+             features=datasets.Features({
+                 'predictions': datasets.Value('string'),
+                 'references': datasets.Value('string'),
+                 'formulas': datasets.Value('string')
+             }),
+             # Homepage of the module for documentation
+             homepage="http://module.homepage",  # TODO: change, once thesis is published
+             # Additional links to the codebase or references
+             codebase_urls=["http://github.com/path/to/codebase/of/new_module"],  # TODO: change, once thesis is published
+             reference_urls=["http://path.to.reference.url/new_module"]  # TODO: change, once thesis is published
+         )
+
+     def _download_and_prepare(self, dl_manager):
+         """Optional: download external resources useful to compute the scores"""
+         # TODO: Download external resources if needed
+         pass
+
+
+     def _compute(self, predictions, references, formulas):
+         """Returns the accuracy, given the parsed output."""
+
+         results = []
+         for generated_output, target, formula in zip(predictions, references, formulas):
+             try:
+                 v = parse_LLM_output(generated_output)
+                 if len(v) == 0:
+                     # The parser could not extract a valuation from the LLM output.
+                     prediction = "undefined"
+                 else:
+                     val = Valuation(v)
+                     dom = val.domain
+                     m = nltk.sem.evaluate.Model(dom, val)
+                     g = nltk.sem.Assignment(dom)
+                     sat = m.evaluate(formula, g)
+                     if sat == True:
+                         prediction = "satisfied"
+                     elif sat == False:
+                         prediction = "unsatisfied"
+                     else:
+                         # nltk returns the string "Undefined" when the formula
+                         # cannot be evaluated against the valuation.
+                         prediction = "undefined"
+             except Exception:
+                 # Malformed output that the parser or nltk cannot handle.
+                 prediction = "undefined"
+
+             results.append(prediction == target)
+
+         accuracy = sum(results) / len(results)
+         return {
+             "accuracy": accuracy,
+         }
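
The heart of _compute is the nltk model-checking step: the parsed valuation becomes a Valuation/Model pair and the formula is evaluated against it. Below is a standalone sketch of just that step, with a toy valuation standing in for the output of parse_LLM_output (the entries are illustrative assumptions, not the parser's actual format).

from nltk.sem import Valuation, Model, Assignment

# Toy valuation: constant 'a' denotes the individual 'a', and predicate P holds of 'a'.
# In the metric, this list comes from parse_LLM_output(generated_output).
v = [("a", "a"), ("P", {"a"})]

val = Valuation(v)   # nltk converts {"a"} into the unary relation {("a",)}
dom = val.domain     # the individuals mentioned in the valuation
m = Model(dom, val)
g = Assignment(dom)

print(m.evaluate("exists x. P(x)", g))  # True        -> "satisfied"
print(m.evaluate("-P(a)", g))           # False       -> "unsatisfied"
print(m.evaluate("Q(a)", g))            # "Undefined" -> "undefined"

The last case is why _compute falls through to "undefined": nltk's Model.evaluate returns the string "Undefined" instead of raising when a symbol in the formula has no interpretation in the valuation.
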
tests.py ADDED
@@ -0,0 +1,17 @@
+ test_cases = [
+     {
+         "predictions": [0, 0],
+         "references": [1, 1],
+         "result": {"metric_score": 0}
+     },
+     {
+         "predictions": [1, 1],
+         "references": [1, 1],
+         "result": {"metric_score": 1}
+     },
+     {
+         "predictions": [1, 0],
+         "references": [1, 1],
+         "result": {"metric_score": 0.5}
+     }
+ ]