| """Levenshtein metric file.""" | |
| from __future__ import annotations | |
| from typing import TYPE_CHECKING | |
| import datasets | |
| import evaluate | |
| from Levenshtein import distance | |
| if TYPE_CHECKING: | |
| from collections.abc import Sequence | |

_CITATION = """\
@InProceedings{huggingface:levenshtein,
  title = {Levenshtein (edit) distance},
  authors={Nathan Fradet},
  year={2024}
}
"""

_DESCRIPTION = """\
This metric computes the Levenshtein (edit) distance.
It directly calls the "Levenshtein" package using the ``distance`` method:
https://rapidfuzz.github.io/Levenshtein/levenshtein.html#Levenshtein.distance
"""

_KWARGS_DESCRIPTION = """
This metric computes the Levenshtein distance, also commonly called "edit distance".
The Levenshtein distance measures the minimum number of substitutions, deletions and
insertions required to transform one string into a second one. It is a popular metric
for text similarity.
This module directly calls the
[Levenshtein package](https://github.com/rapidfuzz/Levenshtein) for fast execution
speed.
Args:
    predictions: list of prediction strings.
    references: list of reference strings.
    **kwargs: keyword arguments to pass to the [Levenshtein.distance](https://rapidfuzz.github.io/Levenshtein/levenshtein.html#Levenshtein.distance)
        method.
Returns:
    Dictionary holding the average Levenshtein distance (lower is better) and the
    Levenshtein ratio within [0, 1] (higher is better).
Examples:
    >>> levenshtein = evaluate.load("Natooz/Levenshtein")
    >>> results = levenshtein.compute(
    ...     predictions=["foo", "baroo"],
    ...     references=["foo", "bar"],
    ... )
    >>> print(results)
    {'levenshtein': 1.0, 'levenshtein_ratio': 0.875}
"""


class Levenshtein(evaluate.Metric):
    """Module for the ``distance`` method of the "Levenshtein" package."""

    def _info(self) -> evaluate.MetricInfo:
        """
        Return the module info.

        :return: module info.
        """
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string"),
                    "references": datasets.Value("string"),
                }
            ),
            # Homepage of the module for documentation
            homepage="https://huggingface.co/spaces/Natooz/Levenshtein",
            # Additional links to the codebase or references
            codebase_urls=[
                "https://github.com/rapidfuzz/Levenshtein",
            ],
            reference_urls=[
                "https://rapidfuzz.github.io/Levenshtein/levenshtein.html#Levenshtein.distance"
            ],
        )

    def _compute(
        self,
        predictions: Sequence[str] | None = None,
        references: Sequence[str] | None = None,
        **kwargs,
    ) -> dict[str, float]:
        """
        Return the average Levenshtein (edit) distance and ratio.

        See the "Levenshtein" PyPI package documentation for the complete usage
        information: https://rapidfuzz.github.io/Levenshtein/
        """
        if len(predictions) != len(references):
            msg = "The number of predictions must be equal to the number of references."
            raise ValueError(msg)
        # Compute the distances
        results, ratios = [], []
        for prediction, reference in zip(predictions, references):
            edit_distance = distance(prediction, reference, **kwargs)
            results.append(edit_distance)
            ratios.append(edit_distance / (len(prediction) + len(reference)))
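        # Dividing by the combined length keeps each per-pair ratio within [0, 1],
        # since the edit distance can never exceed the length of the longer string.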
        # Return average distance and ratio
        return {
            "levenshtein": sum(results) / len(results),
            "levenshtein_ratio": 1 - sum(ratios) / len(ratios),
        }
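

# Minimal usage sketch, assuming the module is loaded from the Hub as in the
# docstring example above; extra keyword arguments given to ``compute`` are
# forwarded to ``Levenshtein.distance`` (e.g. custom operation weights or a
# score cutoff, see the package documentation).
if __name__ == "__main__":
    levenshtein = evaluate.load("Natooz/Levenshtein")
    results = levenshtein.compute(
        predictions=["foo", "baroo"],
        references=["foo", "bar"],
    )
    print(results)  # {'levenshtein': 1.0, 'levenshtein_ratio': 0.875}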