svenwey committed on
Commit
e25b5dd
·
1 Parent(s): 7c61712

adapt logmetric to also make local per-logmsg test using sacrebleu

Browse files
Files changed (1) hide show
  1. logmetric.py +71 -21
logmetric.py CHANGED
@@ -67,7 +67,7 @@ class LogMetric(evaluate.Metric):
67
  """TODO: Short description of my evaluation module."""
68
 
69
  # Constant regex to get timestrings
70
- timestamp_regex = r'(^\d{4}[-/.]\d{2}[-/.]\d{2}(?:[ T]\d{2}[:]\d{2}(?:[:]\d{2}(?:[.,]\d+)?)?(?:Z|[+-]\d{2}[:]\d{2})?)?)'
71
  timestamp_pattern = re.compile(timestamp_regex, re.MULTILINE)
72
 
73
  def _info(self):
@@ -96,54 +96,104 @@ class LogMetric(evaluate.Metric):
96
  # TODO: Download external resources if needed
97
  pass
98
 
99
def getLogMetric(self, pred : str, ref : str):
    """Return 1.0 if the timestamps of the prediction are consistent, else 0.0.

    The prediction passes when all of the following hold:
      * it contains as many timestamps (matches of ``self.timestamp_pattern``)
        as the reference,
      * every timestamp has the same textual format as the prediction's first
        timestamp,
      * every timestamp is parsable by ``dateutil.parser`` and the parsed
        values never decrease.

    A prediction/reference pair without any timestamps scores 1.0.
    """
    whitespace = ' \t\n\r'
    ref = ref.strip(whitespace)
    pred = pred.strip(whitespace)

    # Collect every timestamp occurring in the two logs
    predicted_stamps = self.timestamp_pattern.findall(pred)
    reference_stamps = self.timestamp_pattern.findall(ref)

    # A differing number of timestamps is an immediate failure
    if len(predicted_stamps) != len(reference_stamps):
        return 0.0

    # Nothing to validate when there is no timestamp at all
    if not predicted_stamps:
        return 1.0

    # Turn the first predicted timestamp into a format regex by replacing
    # each of its digits with '\d' (everything else is matched literally)
    format_regex = re.sub(r'\d', r'\\d', re.escape(predicted_stamps[0]))

    # Most recently parsed timestamp, used for the monotonicity check
    last_seen = None
    for stamp in predicted_stamps:
        try:
            same_format = re.fullmatch(format_regex, stamp) is not None
            parsed = dateutil.parser.parse(stamp)
            in_order = last_seen is None or last_seen <= parsed
            last_seen = parsed
        except Exception:
            # e.g. date format not parsable by dateutil.parser
            return 0.0

        if not same_format or not in_order:
            # timestamps not consistent
            return 0.0

    # Correct amount of timestamps, monotonically increasing, consistent
    # and parsable format
    return 1.0
141
 
142
- def _compute(self, predictions, references):
143
  """Returns the scores"""
144
 
 
 
145
  t_before_logmetric = time.perf_counter()
146
- timestamp_score = np.mean([self.getLogMetric(p,r) for p,r in zip(predictions,references)])
147
  t_after_logmetric = time.perf_counter()
148
 
149
  logmetric_duration = f" {t_after_logmetric - t_before_logmetric:0.10f}"
 
67
  """TODO: Short description of my evaluation module."""
68
 
69
  # Constant regex to get timestrings
70
+ timestamp_regex = r'^\s*(\d{4}[-/.]\d{2}[-/.]\d{2}(?:[ T]\d{2}[:]\d{2}(?:[:]\d{2}(?:[.,]\d+)?)?(?:Z|[+-]\d{2}[:]\d{2})?)?)\s*'
71
  timestamp_pattern = re.compile(timestamp_regex, re.MULTILINE)
72
 
73
  def _info(self):
 
96
  # TODO: Download external resources if needed
97
  pass
98
 
99
def getLogMetric(self, pred : str, ref : str, sacrebleu):
    """Score one predicted log against its reference log.

    Both logs are split at their timestamps (matches of
    ``self.timestamp_pattern``) into ``(timestamp, message)`` entries. Text
    in front of the first timestamp is not part of any entry and is ignored.
    The result combines three components:

      * 0.2 * monotonicity score  (100 if the predicted timestamps never
        decrease, else 0),
      * 0.1 * format score        (100 if every predicted timestamp has the
        format of the first predicted timestamp, else 0),
      * 0.7 * message score       (mean per-entry ``sacrebleu`` score of the
        pairwise-aligned messages, scaled by ``min_entries / max_entries``
        so that missing or surplus entries count as 0).

    Args:
        pred: The predicted log text.
        ref: The reference log text.
        sacrebleu: Object whose ``compute(predictions=[...], references=[...])``
            returns a mapping with a ``"score"`` entry; presumably on a
            0-100 scale, matching the other components -- TODO confirm.

    Returns:
        The weighted score. 0.0 if exactly one of the two logs has no
        timestamp. If neither has a timestamp, the message component is an
        exact-match comparison of the stripped texts.
    """
    ref = ref.strip(' \t\n\r')
    pred = pred.strip(' \t\n\r')

    # The pattern has exactly one capture group, so split() yields
    # [text_before, timestamp_1, message_1, timestamp_2, message_2, ...]
    # and therefore always a list of odd length.
    pred_split_log = self.timestamp_pattern.split(pred)
    ref_split_log = self.timestamp_pattern.split(ref)

    # One logentry always consists of timestamp + log-message
    pred_logentries = list(zip(pred_split_log[1::2], pred_split_log[2::2]))
    ref_logentries = list(zip(ref_split_log[1::2], ref_split_log[2::2]))

    # Number of entries of the longer / shorter of the two logs
    max_logentries = max(len(pred_logentries), len(ref_logentries))
    min_logentries = min(len(pred_logentries), len(ref_logentries))

    # Case: no timestamps in the reference and none in the prediction.
    # The prediction may still contain arbitrary (timestamp-free) text, so
    # it must not be assumed to be empty; an exact match decides the
    # message score, the timestamp components are trivially fulfilled.
    if max_logentries == 0:
        logmsg_score = 100.0 if pred == ref else 0.0
        return 0.3 * 100.0 + 0.7 * logmsg_score

    # Case: one log has 0 timestamps, the other has > 0 timestamps.
    # It is nonsensical to compare anything in this case.
    if min_logentries == 0:
        return 0.0

    # Replace all digits of the first predicted timestamp with '\d' to get
    # a regex that describes its format
    pred_timestring_pattern = re.sub(r'\d', r'\\d', re.escape(pred_logentries[0][0]))

    matchesPatternScore = 100.0
    monotonicallyIncreasingScore = 100.0

    # Score per aligned logentry
    logmessage_scores = []
    # Previous timestamp (as datetime obj) to check monotonicity
    prev_datetime = None

    # Only the first min_logentries entries can be aligned pairwise; the
    # surplus entries are penalized in the aggregation below.
    for (ts, pred_lm), (_, ref_lm) in zip(pred_logentries, ref_logentries):
        try:
            # Check if the format matches the format of the first timestamp
            matchesPattern = re.fullmatch(pred_timestring_pattern, ts) is not None
            # Check if the timestamps are monotonically increasing
            cur_datetime = dateutil.parser.parse(ts)
            monotonicallyIncreasing = prev_datetime is None or prev_datetime <= cur_datetime
            prev_datetime = cur_datetime

            # A single entry violating the format or the monotonicity
            # property sets the respective score to 0 for the whole log
            if not matchesPattern:
                matchesPatternScore = 0.0
            if not monotonicallyIncreasing:
                monotonicallyIncreasingScore = 0.0

        except Exception:
            # Deliberately broad: e.g. date format not parsable by
            # dateutil.parser, or timestamps that cannot be compared.
            matchesPatternScore = 0.0
            monotonicallyIncreasingScore = 0.0

        logmessage_scores.append(
            sacrebleu.compute(predictions=[pred_lm], references=[ref_lm])["score"]
        )

    # Aggregate the per-entry scores; entries without a counterpart in the
    # other log are weighted in with a score of 0
    logmessage_aggregated_score = (min_logentries / max_logentries) * np.mean(logmessage_scores)

    return 0.2 * monotonicallyIncreasingScore + 0.1 * matchesPatternScore + 0.7 * logmessage_aggregated_score
189
 
190
+ def _compute(self, predictions, references, sacrebleu):
191
  """Returns the scores"""
192
 
193
+ # TODO: get separate log entries (split before timestamps), replace timestamps with token and compare the log entry with BLEU
194
+
195
  t_before_logmetric = time.perf_counter()
196
+ timestamp_score = np.mean([self.getLogMetric(p,r, sacrebleu) for p,r in zip(predictions,references)])
197
  t_after_logmetric = time.perf_counter()
198
 
199
  logmetric_duration = f" {t_after_logmetric - t_before_logmetric:0.10f}"