lsy641 committed
Commit
a379e21
1 Parent(s): afbef56

update distinct

Files changed (1)
  1. distinct.py +7 -7
distinct.py CHANGED
@@ -146,27 +146,27 @@ class distinct(evaluate.Measurement):
         total_tokens = []
         total_tokens_2grams = []
         total_tokens_3grams = []
+
         for prediction in predictions:
             if tokenizer == "white_space":
                 tokens = prediction.split(" ")
-                tokens_2grams = ngrams(prediction.split(" "), 2, left_pad_symbol='<s>')
-                tokens_3grams = ngrams(prediction.split(" "), 3, left_pad_symbol='<s>')
+                tokens_2grams = list(ngrams(prediction.split(" "), 2, pad_left=True, left_pad_symbol='<s>'))
+                tokens_3grams = list(ngrams(prediction.split(" "), 3, pad_left=True, left_pad_symbol='<s>'))
             else:
                 try:
                     tokens = list(tokenizer.tokenize(prediction))
-                    tokens_2grams = ngrams(list(tokenizer.tokenize(prediction)), 2, left_pad_symbol='<s>')
-                    tokens_3grams = ngrams(list(tokenizer.tokenize(prediction)), 3, left_pad_symbol='<s>')
-
+                    tokens_2grams = list(ngrams(list(tokenizer.tokenize(prediction)), 2, pad_left=True, left_pad_symbol='<s>'))
+                    tokens_3grams = list(ngrams(list(tokenizer.tokenize(prediction)), 3, pad_left=True, left_pad_symbol='<s>'))
                 except Exception as e:
                     raise e
-            print(tokens_2grams)
+
             distinct_tokens = distinct_tokens | set(tokens)
             distinct_tokens_2grams = distinct_tokens_2grams | set(tokens_2grams)
             distinct_tokens_3grams = distinct_tokens_3grams | set(tokens_3grams)
             total_tokens.extend(tokens)
             total_tokens_2grams.extend(list(tokens_2grams))
             total_tokens_3grams.extend(list(tokens_3grams))
-        print(distinct_tokens_2grams, total_tokens_2grams)
+
         Distinct_1 = len(distinct_tokens)/len(total_tokens)
         Distinct_2 = len(distinct_tokens_2grams)/len(total_tokens_2grams)
         Distinct_3 = len(distinct_tokens_3grams)/len(total_tokens_3grams)
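
For context, a minimal standalone sketch (not part of the commit) of what the change addresses, assuming ngrams here is nltk.util.ngrams: without pad_left=True the left_pad_symbol argument has no effect, and ngrams() returns a lazy iterator that can be consumed only once, so wrapping it in list() is needed before it feeds both the distinct set and the running totals.

# A minimal sketch, not part of the commit, assuming ngrams is nltk.util.ngrams.
from nltk.util import ngrams

tokens = "a b c".split(" ")

# Without pad_left=True, left_pad_symbol is silently ignored: no '<s>' padding.
print(list(ngrams(tokens, 2, left_pad_symbol='<s>')))
# -> [('a', 'b'), ('b', 'c')]

# With pad_left=True, the '<s>' padding is applied as intended.
print(list(ngrams(tokens, 2, pad_left=True, left_pad_symbol='<s>')))
# -> [('<s>', 'a'), ('a', 'b'), ('b', 'c')]

# ngrams() returns a lazy iterator that can be consumed only once. The old code
# passed the bare iterator to set(...) and later to .extend(...); whichever ran
# second saw an exhausted iterator. Materializing with list() avoids that.
gen = ngrams(tokens, 2)
print(set(gen))    # consumes the iterator: {('a', 'b'), ('b', 'c')}
print(list(gen))   # already exhausted: []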