import datasets
import evaluate
import numpy as np
from Levenshtein import distance as lev_dist

_DESCRIPTION = """
TokenEditDistance: an NLP evaluation metric that records the minimum number of
token edits (insertions, deletions, and replacements, all weighted equally)
required to make the prediction exactly match the reference. It uses the same
logic as Levenshtein edit distance, except applied to tokens (i.e. individual
ints in a list) rather than to individual characters in a string.
"""

_CITATION = "Man of a thousand and eight names"

_KWARGS_DESCRIPTION = """
TokenEditDistance:
Args:
    predictions: list of predictions to score. Each prediction should be
        tokenized into a list of tokens.
    references: list of references/ground-truth outputs to score against.
        Each reference should be tokenized into a list of tokens.
Returns:
    "avg_token_edit_distance": Float, the average Token Edit Distance over
        all input predictions and references.
    "token_edit_distances": List[Int], the Token Edit Distance for each
        input prediction/reference pair.
Examples:
    >>> token_edit_distance_metric = evaluate.load("token_edit_distance")
    >>> references = [[15, 4243], [100, 10008]]
    >>> predictions = [[15, 4243], [100, 10009]]
    >>> results = token_edit_distance_metric.compute(predictions=predictions, references=references)
    >>> print(results)
    {'avg_token_edit_distance': 0.5, 'token_edit_distances': array([0., 1.])}
"""


class TokenEditDistance(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.features.Sequence(datasets.Value("int32")),
                    "references": datasets.features.Sequence(datasets.Value("int32")),
                }
            ),
            codebase_urls=[],
            reference_urls=[],
        )

    def _compute(self, references, predictions):
        if len(predictions) != len(references):
            raise ValueError(
                "Token Edit Distance: Compute Error: Number of predictions "
                "does not match number of references."
            )
        edit_dist_arr = np.zeros(len(predictions))
        for i in range(len(edit_dist_arr)):
            if len(predictions[i]) != len(references[i]):
                raise ValueError(
                    f"Token Edit Distance: Compute Error: Prediction length "
                    f"does not match reference length for example {i} "
                    f"(prediction len: {len(predictions[i])}, "
                    f"reference len: {len(references[i])})."
                )
            # `Levenshtein.distance` accepts arbitrary sequences, so it can be
            # applied directly to the token (int) lists.
            edit_dist_arr[i] = lev_dist(predictions[i], references[i])
        return {
            "avg_token_edit_distance": np.mean(edit_dist_arr),
            "token_edit_distances": edit_dist_arr,
        }
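
# For reference, a minimal pure-Python sketch of the token-level Levenshtein
# recurrence that `Levenshtein.distance` computes natively in C: a dynamic
# program over prefix pairs, where each cell holds the cheapest way to turn
# one prefix into the other via unit-cost insertions, deletions, and
# replacements. `_token_levenshtein` is an illustrative name only, not part
# of this module's API; the metric above uses the library implementation.
def _token_levenshtein(a: list, b: list) -> int:
    prev = list(range(len(b) + 1))  # distances from the empty prefix of `a`
    for i, tok_a in enumerate(a, start=1):
        curr = [i]  # cost of deleting all i tokens of `a` seen so far
        for j, tok_b in enumerate(b, start=1):
            cost = 0 if tok_a == tok_b else 1
            curr.append(
                min(
                    prev[j] + 1,         # delete tok_a
                    curr[j - 1] + 1,     # insert tok_b
                    prev[j - 1] + cost,  # replace (or keep) tok_a
                )
            )
        prev = curr
    return prev[-1]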
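
# A quick smoke test, assuming the `evaluate` convention that a Metric
# subclass can be instantiated and queried directly; in normal use the file
# would instead be loaded by path via `evaluate.load`, as in the docstring
# example. The token IDs below are arbitrary illustrations.
if __name__ == "__main__":
    metric = TokenEditDistance()
    results = metric.compute(
        predictions=[[15, 4243], [100, 10009]],
        references=[[15, 4243], [100, 10008]],
    )
    # One replaced token in the second pair -> distances [0., 1.], mean 0.5.
    print(results)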