# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Distinct: corpus-level language-diversity measurement (EAD and original Distinct)."""

import warnings

import evaluate
import datasets


_CITATION = """\
@inproceedings{liu-etal-2022-rethinking,
    title = "Rethinking and Refining the Distinct Metric",
    author = "Liu, Siyang and Sabour, Sahand and Zheng, Yinhe and Ke, Pei and Zhu, Xiaoyan and Huang, Minlie",
    booktitle = "Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)",
    year = "2022",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2022.acl-short.86",
    doi = "10.18653/v1/2022.acl-short.86",
}
@inproceedings{li-etal-2016-diversity,
    title = "A Diversity-Promoting Objective Function for Neural Conversation Models",
    author = "Li, Jiwei and Galley, Michel and Brockett, Chris and Gao, Jianfeng and Dolan, Bill",
    booktitle = "Proceedings of the 2016 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies",
    year = "2016",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/N16-1014",
    doi = "10.18653/v1/N16-1014",
}
"""

_DESCRIPTION = """\
Distinct metric is to calculate corpus-level diversity of language.
We provide two versions of distinct score. Expectation-Adjusted-Distinct is the default one,
which removes the biases of the original distinct score on lengthier sentences.
Distinct is the original version. For the use of Expectation-Adjusted-Distinct, vocab_size is required.
Please follow ACL paper https://aclanthology.org/2022.acl-short.86 for motivation and follow the rule of
thumb provided by https://github.com/lsy641/Expectation-Adjusted-Distinct/blob/main/EAD.ipynb
to determine the vocab_size
"""

_KWARGS_DESCRIPTION = """
Calculates corpus-level diversity of the given predictions.
Args:
    predictions: list of sentences. Each prediction should be a string.
Returns:
    Expectation-Adjusted-Distinct
    Distinct-1
    Distinct-2
    Distinct-3
Examples:
    >>> my_new_module = evaluate.load("distinct")
    >>> results = my_new_module.compute(references=["Hi.", "I'm sorry to hear that", "I don't know"], vocab_size=50257)
    >>> print(results)
    >>> dataset = ["This is my friend jack", "I'm sorry to hear that", "But you know I am the one who always support you", "Welcome to our family"]
    >>> results = my_new_module.compute(references=["Hi.", "I'm sorry to hear that", "I don't know"], dataForVocabCal = dataset)
    >>> print(results)
    >>> results = my_new_module.compute(references=["Hi.", "I'm sorry to hear that", "I don't know"], mode="Distinct")
    >>> print(results)
"""

# NOTE(review): template leftover, unused by this module; kept in case external code references it.
BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"

# The two supported scoring modes (see _compute).
_MODES = ("Expectation-Adjusted-Distinct", "Distinct")


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class distinct(evaluate.Measurement):
    """Corpus-level diversity: Expectation-Adjusted-Distinct (EAD) and Distinct-1/2/3."""

    def _info(self):
        return evaluate.MeasurementInfo(
            module_type="measurement",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # Each prediction is a single sentence (a plain string).
            # Fix: was `datasets.Sequence('string')`, which is not a valid feature
            # spec for a list of strings — `datasets.Value("string")` is.
            features=datasets.Features({"predictions": datasets.Value("string")}),
            # Homepage of the module for documentation
            homepage="https://huggingface.co/spaces/lsy641/distinct",
            # Additional links to the codebase or references
            codebase_urls=["https://github.com/lsy641/Expectation-Adjusted-Distinct/tree/main"],
            reference_urls=["https://aclanthology.org/2022.acl-short.86/"],
        )

    def _download_and_prepare(self, dl_manager):
        """No external resources are needed to compute the scores."""
        pass

    def _compute(self, predictions, dataForVocabCal=None, vocab_size=None, tokenizer="white_space",
                 mode="Expectation-Adjusted-Distinct"):
        """Compute distinct scores for a corpus of predictions.

        Args:
            predictions: non-empty list of sentences (strings).
            dataForVocabCal: optional list of strings from which the vocabulary
                size is derived (its set of distinct tokens). Takes precedence
                over `vocab_size` when both are given.
            vocab_size: vocabulary size used by the EAD expectation term.
            tokenizer: the literal string "white_space" (split on single spaces),
                or any object exposing a `tokenize(str) -> iterable` method.
            mode: "Expectation-Adjusted-Distinct" (default) or "Distinct".

        Returns:
            dict with "Distinct-1/2/3" and, in EAD mode, "Expectation-Adjusted-Distinct".

        Raises:
            ValueError: unknown mode, empty predictions, or EAD mode without
                either `vocab_size` or `dataForVocabCal`.
            TypeError: `dataForVocabCal` is not a list of strings.
        """
        from nltk.util import ngrams

        def _tokenize(sentence):
            # Single helper so prediction and vocabulary tokenization stay consistent.
            if tokenizer == "white_space":
                return sentence.split(" ")
            return list(tokenizer.tokenize(sentence))

        if mode not in _MODES:
            # Fix: an unknown mode used to fall through and silently return None.
            raise ValueError(f"mode should be one of {_MODES}, got {mode!r}")
        if not predictions:
            # Fix: empty input used to surface as an opaque ZeroDivisionError.
            raise ValueError("predictions should be a non-empty list of strings")

        if mode == "Expectation-Adjusted-Distinct":
            if vocab_size is None and dataForVocabCal is None:
                raise ValueError(
                    "Either vocab_size or dataForVocabCal needs to be specified when using mode "
                    "'Expectation-Adjusted-Distinct'. See "
                    "https://github.com/lsy641/Expectation-Adjusted-Distinct/blob/main/EAD.ipynb "
                    "for vocab_size specification. \n Or use mode='Distinct' to get original "
                    "version of distinct score."
                )
            if vocab_size is not None and dataForVocabCal is not None:
                # Bug fix: this used to `raise Warning(...)`, which aborted the computation
                # instead of warning and continuing as the message promises.
                warnings.warn(
                    "We've detected that both vocab_size and dataForVocabCal are specified. "
                    "We will use dataForVocabCal."
                )
            if dataForVocabCal is not None:
                if not (isinstance(dataForVocabCal, list) and dataForVocabCal
                        and isinstance(dataForVocabCal[0], str)):
                    raise TypeError("Argument dataForVocabCal should be a list of strings")
                vocab = set()
                for sentence in dataForVocabCal:
                    vocab.update(_tokenize(sentence))
                vocab_size = len(vocab)

        distinct_1grams, distinct_2grams, distinct_3grams = set(), set(), set()
        total_1grams = total_2grams = total_3grams = 0
        for prediction in predictions:
            tokens = _tokenize(prediction)
            # NOTE(review): left_pad_symbol has no effect without pad_left=True; kept
            # as in the original so scores are unchanged — confirm intent upstream.
            bigrams = list(ngrams(tokens, 2, left_pad_symbol=""))
            trigrams = list(ngrams(tokens, 3, left_pad_symbol=""))
            distinct_1grams.update(tokens)
            distinct_2grams.update(bigrams)
            distinct_3grams.update(trigrams)
            total_1grams += len(tokens)
            total_2grams += len(bigrams)
            total_3grams += len(trigrams)

        scores = {
            "Distinct-1": len(distinct_1grams) / total_1grams,
            # Still ZeroDivisionError if every sentence has < 2 (or < 3) tokens,
            # matching the original behavior.
            "Distinct-2": len(distinct_2grams) / total_2grams,
            "Distinct-3": len(distinct_3grams) / total_3grams,
        }

        if mode == "Expectation-Adjusted-Distinct":
            # EAD = |distinct 1-grams| / E[|distinct 1-grams|], where the expectation is
            # vocab_size * (1 - ((vocab_size - 1) / vocab_size) ** total_tokens)
            # (Liu et al., ACL 2022).
            expectation = vocab_size * (1 - ((vocab_size - 1) / vocab_size) ** total_1grams)
            return {"Expectation-Adjusted-Distinct": len(distinct_1grams) / expectation, **scores}

        # mode == "Distinct"
        # Fix: the original had `return` and the result dict on separate lines,
        # returning None and leaving the dict unreachable.
        return scores