distinct
Browse files- distinct.py +1 -7
distinct.py
CHANGED
@@ -117,7 +117,6 @@ class distinct(evaluate.Measurement):
|
|
117 |
|
118 |
def _compute(self, predictions, dataForVocabCal=None, vocab_size=None, tokenizer=Tokenizer13a(), mode="Expectation-Adjusted-Distinct"):
|
119 |
from nltk.util import ngrams
|
120 |
-
from nltk.tokenize import WhitespaceTokenizer
|
121 |
|
122 |
|
123 |
|
@@ -128,9 +127,7 @@ class distinct(evaluate.Measurement):
|
|
128 |
raise Warning("We've detected that both vocab_size and dataForVocabCal are specified. We will use dataForVocabCal.")
|
129 |
elif mode == "Distinct":
|
130 |
pass
|
131 |
-
|
132 |
-
if tokenizer == "white_space":
|
133 |
-
tokenizer = WhitespaceTokenizer()
|
134 |
|
135 |
if mode == "Expectation-Adjusted-Distinct" and dataForVocabCal is not None:
|
136 |
if isinstance(dataForVocabCal, list) and len(dataForVocabCal) > 0 and isinstance(dataForVocabCal[0], str):
|
@@ -152,10 +149,7 @@ class distinct(evaluate.Measurement):
|
|
152 |
|
153 |
for prediction in predictions:
|
154 |
try:
|
155 |
-
print(prediction)
|
156 |
-
print(tokenizer.tokenize(prediction))
|
157 |
tokens = list(tokenizer.tokenize(prediction))
|
158 |
-
print(tokens)
|
159 |
tokens_2grams = list(ngrams(list(tokenizer.tokenize(prediction)), 2, pad_left=True, left_pad_symbol='<s>'))
|
160 |
tokens_3grams = list(ngrams(list(tokenizer.tokenize(prediction)), 3, pad_left=True, left_pad_symbol='<s>'))
|
161 |
except Exception as e:
|
|
|
117 |
|
118 |
def _compute(self, predictions, dataForVocabCal=None, vocab_size=None, tokenizer=Tokenizer13a(), mode="Expectation-Adjusted-Distinct"):
|
119 |
from nltk.util import ngrams
|
|
|
120 |
|
121 |
|
122 |
|
|
|
127 |
raise Warning("We've detected that both vocab_size and dataForVocabCal are specified. We will use dataForVocabCal.")
|
128 |
elif mode == "Distinct":
|
129 |
pass
|
130 |
+
|
|
|
|
|
131 |
|
132 |
if mode == "Expectation-Adjusted-Distinct" and dataForVocabCal is not None:
|
133 |
if isinstance(dataForVocabCal, list) and len(dataForVocabCal) > 0 and isinstance(dataForVocabCal[0], str):
|
|
|
149 |
|
150 |
for prediction in predictions:
|
151 |
try:
|
|
|
|
|
152 |
tokens = list(tokenizer.tokenize(prediction))
|
|
|
153 |
tokens_2grams = list(ngrams(list(tokenizer.tokenize(prediction)), 2, pad_left=True, left_pad_symbol='<s>'))
|
154 |
tokens_3grams = list(ngrams(list(tokenizer.tokenize(prediction)), 3, pad_left=True, left_pad_symbol='<s>'))
|
155 |
except Exception as e:
|