# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings

import evaluate
import datasets
from .tokenizer_13a import Tokenizer13a
_CITATION = """\
@inproceedings{liu-etal-2022-rethinking,
title = "Rethinking and Refining the Distinct Metric",
author = "Liu, Siyang and
Sabour, Sahand and
Zheng, Yinhe and
Ke, Pei and
Zhu, Xiaoyan and
Huang, Minlie",
booktitle = "Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)",
year = "2022",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.acl-short.86",
doi = "10.18653/v1/2022.acl-short.86",
}
@inproceedings{li-etal-2016-diversity,
title = "A Diversity-Promoting Objective Function for Neural Conversation Models",
author = "Li, Jiwei and
Galley, Michel and
Brockett, Chris and
Gao, Jianfeng and
Dolan, Bill",
booktitle = "Proceedings of the 2016 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies",
year = "2016",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/N16-1014",
doi = "10.18653/v1/N16-1014",
}
"""
# ![Comparison between original distinct and EAD](https://huggingface.co/spaces/lsy641/distinct/resolve/main/distinct_compare_pic.jpg)
_DESCRIPTION = """\
The Distinct metric calculates the corpus-level diversity of language. We provide two versions of the distinct score.
Expectation-Adjusted-Distinct (EAD), the default, removes the bias of the original distinct score towards longer sentences
(see the comparison figure linked in the comment above). Distinct is the original version.
"""
_KWARGS_DESCRIPTION = """
Calculates the corpus-level diversity of the given predictions.
Args:
    predictions: list of sentences. Each prediction should be a string.
    dataForVocabCal: optional list of strings from which the vocabulary size is estimated for EAD.
    vocab_size: optional int giving the vocabulary size for EAD (ignored when dataForVocabCal is provided).
    tokenizer: tokenizer used to split sentences into tokens. Defaults to Tokenizer13a; pass "white_space" to split on whitespace.
    mode: "Expectation-Adjusted-Distinct" (default) or "Distinct".
Returns:
    Expectation-Adjusted-Distinct (only in "Expectation-Adjusted-Distinct" mode)
    Distinct-1
    Distinct-2
    Distinct-3
Examples:
    >>> my_new_module = evaluate.load("lsy641/distinct")
    >>> results = my_new_module.compute(predictions=["Hi.", "I'm sorry to hear that", "I don't know"], vocab_size=50257)
    >>> print(results)
    >>> dataset = ["This is my friend jack", "I'm sorry to hear that", "But you know I am the one who always support you", "Welcome to our family"]
    >>> results = my_new_module.compute(predictions=["Hi.", "I'm sorry to hear that", "I don't know"], dataForVocabCal=dataset)
    >>> print(results)
    >>> results = my_new_module.compute(predictions=["Hi.", "I'm sorry to hear that", "I don't know"], mode="Distinct")
    >>> print(results)
"""
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class distinct(evaluate.Measurement):
def _info(self):
return evaluate.MeasurementInfo(
# This is the description that will appear on the modules page.
module_type="measurement",
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
# This defines the format of each prediction and reference
features=datasets.Features({
'predictions': datasets.Value('string')
}),
# Homepage of the module for documentation
homepage="https://huggingface.co/spaces/lsy641/distinct",
# Additional links to the codebase or references
codebase_urls=["https://github.com/lsy641/Expectation-Adjusted-Distinct/tree/main"],
reference_urls=["https://aclanthology.org/2022.acl-short.86/"]
)
def _download_and_prepare(self, dl_manager):
"""Optional: download external resources useful to compute the scores"""
    def _compute(self, predictions, dataForVocabCal=None, vocab_size=None, tokenizer=Tokenizer13a(), mode="Expectation-Adjusted-Distinct"):
        """Returns the scores"""
        from nltk.util import ngrams
        from nltk.tokenize import WhitespaceTokenizer
if mode == "Expectation-Adjusted-Distinct" and vocab_size is None and dataForVocabCal is None:
raise ValueError("Either vocab_size or dataForVocabCal needs to be specified when using mode 'Expectation-Adjusted-Distinct'. See https://github.com/lsy641/Expectation-Adjusted-Distinct/blob/main/EAD.ipynb for vocab_size specification. \n Or use mode='Distinct' to get original version of distinct score.")
elif mode == "Expectation-Adjusted-Distinct" and vocab_size is not None and dataForVocabCal is not None:
raise Warning("We've detected that both vocab_size and dataForVocabCal are specified. We will use dataForVocabCal.")
elif mode == "Distinct":
pass
if tokenizer == "white_space":
tokenizer = WhitespaceTokenizer()
if mode == "Expectation-Adjusted-Distinct" and dataForVocabCal is not None:
if isinstance(dataForVocabCal, list) and len(dataForVocabCal) > 0 and isinstance(dataForVocabCal[0], str):
vocab = set()
for sentence in dataForVocabCal:
# if tokenizer == "white_space":
# vocab = vocab | set(sentence.split(" "))
# else:
vocab = vocab | set(tokenizer.tokenize(sentence))
vocab_size = len(vocab)
else:
raise TypeError("Argument dataForVocabCal should be a list of strings")
distinct_tokens = set()
distinct_tokens_2grams = set()
distinct_tokens_3grams = set()
total_tokens = []
total_tokens_2grams = []
total_tokens_3grams = []
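        # Collect the distinct and total uni-, bi-, and tri-grams over all predictions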
        for prediction in predictions:
            tokens = list(tokenizer.tokenize(prediction))
            # Pad on the left so every sentence contributes at least one 2-gram and 3-gram
            tokens_2grams = list(ngrams(tokens, 2, pad_left=True, left_pad_symbol='<s>'))
            tokens_3grams = list(ngrams(tokens, 3, pad_left=True, left_pad_symbol='<s>'))
            distinct_tokens = distinct_tokens | set(tokens)
            distinct_tokens_2grams = distinct_tokens_2grams | set(tokens_2grams)
            distinct_tokens_3grams = distinct_tokens_3grams | set(tokens_3grams)
            total_tokens.extend(tokens)
            total_tokens_2grams.extend(tokens_2grams)
            total_tokens_3grams.extend(tokens_3grams)
Distinct_1 = len(distinct_tokens)/len(total_tokens)
Distinct_2 = len(distinct_tokens_2grams)/len(total_tokens_2grams)
Distinct_3 = len(distinct_tokens_3grams)/len(total_tokens_3grams)
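        # EAD (Liu et al., 2022) divides the number of distinct tokens by its expected value under a
        # uniform token distribution, E[distinct] = V * (1 - ((V - 1) / V) ** C), where V is the
        # vocabulary size and C is the total number of tokens in the predictions.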
if mode == "Expectation-Adjusted-Distinct":
Expectation_Adjusted_Distinct = len(distinct_tokens)/(vocab_size*(1-((vocab_size-1)/vocab_size)**len(total_tokens)))
return {
"Expectation-Adjusted-Distinct": Expectation_Adjusted_Distinct,
"Distinct-1": Distinct_1,
"Distinct-2": Distinct_2,
"Distinct-3": Distinct_3
}
if mode == "Distinct":
return {
"Distinct-1": Distinct_1,
"Distinct-2": Distinct_2,
"Distinct-3": Distinct_3
}
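

# A minimal usage sketch, not part of the metric itself. It assumes this script is loaded from the
# "lsy641/distinct" space (as in the docstring above), which requires network access on first use.
if __name__ == "__main__":
    distinct_metric = evaluate.load("lsy641/distinct")
    predictions = ["Hi.", "I'm sorry to hear that", "I don't know"]
    vocab_data = ["This is my friend jack", "I'm sorry to hear that",
                  "But you know I am the one who always support you", "Welcome to our family"]
    # Expectation-Adjusted-Distinct with the vocabulary size estimated from vocab_data
    print(distinct_metric.compute(predictions=predictions, dataForVocabCal=vocab_data))
    # Original distinct scores only
    print(distinct_metric.compute(predictions=predictions, mode="Distinct"))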