| """Levenshtein metric file.""" | |
| from __future__ import annotations | |
| from typing import TYPE_CHECKING | |
| import datasets | |
| import evaluate | |
| from Levenshtein import distance | |
| if TYPE_CHECKING: | |
| from collections.abc import Sequence | |

_CITATION = """\
@InProceedings{huggingface:levenshtein,
  title = {Levenshtein (edit) distance},
  authors={Nathan Fradet},
  year={2024}
}
"""

_DESCRIPTION = """\
This metric computes the Levenshtein (edit) distance.
It directly calls the "Levenshtein" package using the ``distance`` method:
https://rapidfuzz.github.io/Levenshtein/levenshtein.html#Levenshtein.distance
"""

_KWARGS_DESCRIPTION = """
This metric computes the Levenshtein distance, also commonly called "edit distance".
The Levenshtein distance measures the minimum number of substitutions, deletions and
insertions required to transform one string into a second one. It is a popular metric
for text similarity.
This module directly calls the
[Levenshtein package](https://github.com/rapidfuzz/Levenshtein) for fast execution
speed.
Args:
    predictions: list of prediction strings.
    references: list of reference strings.
    **kwargs: keyword arguments to pass to the [Levenshtein.distance](https://rapidfuzz.github.io/Levenshtein/levenshtein.html#Levenshtein.distance)
        method.
Returns:
    Dictionary holding the average Levenshtein distance (lower is better) and the
    Levenshtein ratio within [0, 1] (higher is better).
Examples:
    >>> levenshtein = evaluate.load("Natooz/Levenshtein")
    >>> results = levenshtein.compute(
    ...     predictions=["foo", "baroo"],
    ...     references=["foo", "bar"],
    ... )
    >>> print(results)
    {'levenshtein': 1.0, 'levenshtein_ratio': 0.875}
"""


class Levenshtein(evaluate.Metric):
    """Module for the ``distance`` method of the "Levenshtein" package."""

    def _info(self) -> evaluate.MetricInfo:
        """
        Return the module info.

        :return: module info.
        """
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string"),
                    "references": datasets.Value("string"),
                }
            ),
            # Homepage of the module for documentation
            homepage="https://huggingface.co/spaces/Natooz/Levenshtein",
            # Additional links to the codebase or references
            codebase_urls=[
                "https://github.com/rapidfuzz/Levenshtein",
            ],
            reference_urls=[
                "https://rapidfuzz.github.io/Levenshtein/levenshtein.html#Levenshtein.distance"
            ],
        )

    def _compute(
        self,
        predictions: Sequence[str] | None = None,
        references: Sequence[str] | None = None,
        **kwargs,
    ) -> dict[str, float]:
        """
        Return the average Levenshtein (edit) distance and ratio.

        See the "Levenshtein" PyPI package documentation for the complete usage
        information: https://rapidfuzz.github.io/Levenshtein/
        """
        if len(predictions) != len(references):
            msg = "The number of predictions must be equal to the number of references."
            raise ValueError(msg)
        # Compute the distances
        results, ratios = [], []
        for prediction, reference in zip(predictions, references):
            edit_distance = distance(prediction, reference, **kwargs)
            results.append(edit_distance)
            ratios.append(edit_distance / (len(prediction) + len(reference)))
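        # Dividing by the combined length keeps each per-pair ratio within [0, 1],
        # since the edit distance can never exceed the length of the longer string.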
        # Return average distance and ratio
        return {
            "levenshtein": sum(results) / len(results),
            "levenshtein_ratio": 1 - sum(ratios) / len(ratios),
        }
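

# Minimal usage sketch, assuming the module is loaded from the Hub as in the
# docstring example above; extra keyword arguments given to ``compute`` are
# forwarded to ``Levenshtein.distance`` (e.g. custom operation weights or a
# score cutoff, see the package documentation).
if __name__ == "__main__":
    levenshtein = evaluate.load("Natooz/Levenshtein")
    results = levenshtein.compute(
        predictions=["foo", "baroo"],
        references=["foo", "bar"],
    )
    print(results)  # {'levenshtein': 1.0, 'levenshtein_ratio': 0.875}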