Spaces:
Sleeping
Sleeping
# Natural Language Toolkit: Text Segmentation Metrics | |
# | |
# Copyright (C) 2001-2023 NLTK Project | |
# Author: Edward Loper <edloper@gmail.com> | |
# Steven Bird <stevenbird1@gmail.com> | |
# David Doukhan <david.doukhan@gmail.com> | |
# URL: <https://www.nltk.org/> | |
# For license information, see LICENSE.TXT | |
""" | |
Text Segmentation Metrics | |
1. Windowdiff | |
Pevzner, L., and Hearst, M., A Critique and Improvement of | |
an Evaluation Metric for Text Segmentation, | |
Computational Linguistics 28, 19-36 | |
2. Generalized Hamming Distance | |
Bookstein A., Kulyukin V.A., Raita T. | |
Generalized Hamming Distance | |
Information Retrieval 5, 2002, pp 353-375 | |
Baseline implementation in C++ | |
http://digital.cs.usu.edu/~vkulyukin/vkweb/software/ghd/ghd.html | |
Study describing benefits of Generalized Hamming Distance Versus | |
WindowDiff for evaluating text segmentation tasks | |
Begsten, Y. Quel indice pour mesurer l'efficacite en segmentation de textes ? | |
TALN 2009 | |
3. Pk text segmentation metric | |
Beeferman D., Berger A., Lafferty J. (1999) | |
Statistical Models for Text Segmentation | |
Machine Learning, 34, 177-210 | |
""" | |
try: | |
import numpy as np | |
except ImportError: | |
pass | |
def windowdiff(seg1, seg2, k, boundary="1", weighted=False): | |
""" | |
Compute the windowdiff score for a pair of segmentations. A | |
segmentation is any sequence over a vocabulary of two items | |
(e.g. "0", "1"), where the specified boundary value is used to | |
mark the edge of a segmentation. | |
>>> s1 = "000100000010" | |
>>> s2 = "000010000100" | |
>>> s3 = "100000010000" | |
>>> '%.2f' % windowdiff(s1, s1, 3) | |
'0.00' | |
>>> '%.2f' % windowdiff(s1, s2, 3) | |
'0.30' | |
>>> '%.2f' % windowdiff(s2, s3, 3) | |
'0.80' | |
:param seg1: a segmentation | |
:type seg1: str or list | |
:param seg2: a segmentation | |
:type seg2: str or list | |
:param k: window width | |
:type k: int | |
:param boundary: boundary value | |
:type boundary: str or int or bool | |
:param weighted: use the weighted variant of windowdiff | |
:type weighted: boolean | |
:rtype: float | |
""" | |
if len(seg1) != len(seg2): | |
raise ValueError("Segmentations have unequal length") | |
if k > len(seg1): | |
raise ValueError( | |
"Window width k should be smaller or equal than segmentation lengths" | |
) | |
wd = 0 | |
for i in range(len(seg1) - k + 1): | |
ndiff = abs(seg1[i : i + k].count(boundary) - seg2[i : i + k].count(boundary)) | |
if weighted: | |
wd += ndiff | |
else: | |
wd += min(1, ndiff) | |
return wd / (len(seg1) - k + 1.0) | |
# Generalized Hamming Distance | |
def _init_mat(nrows, ncols, ins_cost, del_cost): | |
mat = np.empty((nrows, ncols)) | |
mat[0, :] = ins_cost * np.arange(ncols) | |
mat[:, 0] = del_cost * np.arange(nrows) | |
return mat | |
def _ghd_aux(mat, rowv, colv, ins_cost, del_cost, shift_cost_coeff): | |
for i, rowi in enumerate(rowv): | |
for j, colj in enumerate(colv): | |
shift_cost = shift_cost_coeff * abs(rowi - colj) + mat[i, j] | |
if rowi == colj: | |
# boundaries are at the same location, no transformation required | |
tcost = mat[i, j] | |
elif rowi > colj: | |
# boundary match through a deletion | |
tcost = del_cost + mat[i, j + 1] | |
else: | |
# boundary match through an insertion | |
tcost = ins_cost + mat[i + 1, j] | |
mat[i + 1, j + 1] = min(tcost, shift_cost) | |
def ghd(ref, hyp, ins_cost=2.0, del_cost=2.0, shift_cost_coeff=1.0, boundary="1"): | |
""" | |
Compute the Generalized Hamming Distance for a reference and a hypothetical | |
segmentation, corresponding to the cost related to the transformation | |
of the hypothetical segmentation into the reference segmentation | |
through boundary insertion, deletion and shift operations. | |
A segmentation is any sequence over a vocabulary of two items | |
(e.g. "0", "1"), where the specified boundary value is used to | |
mark the edge of a segmentation. | |
Recommended parameter values are a shift_cost_coeff of 2. | |
Associated with a ins_cost, and del_cost equal to the mean segment | |
length in the reference segmentation. | |
>>> # Same examples as Kulyukin C++ implementation | |
>>> ghd('1100100000', '1100010000', 1.0, 1.0, 0.5) | |
0.5 | |
>>> ghd('1100100000', '1100000001', 1.0, 1.0, 0.5) | |
2.0 | |
>>> ghd('011', '110', 1.0, 1.0, 0.5) | |
1.0 | |
>>> ghd('1', '0', 1.0, 1.0, 0.5) | |
1.0 | |
>>> ghd('111', '000', 1.0, 1.0, 0.5) | |
3.0 | |
>>> ghd('000', '111', 1.0, 2.0, 0.5) | |
6.0 | |
:param ref: the reference segmentation | |
:type ref: str or list | |
:param hyp: the hypothetical segmentation | |
:type hyp: str or list | |
:param ins_cost: insertion cost | |
:type ins_cost: float | |
:param del_cost: deletion cost | |
:type del_cost: float | |
:param shift_cost_coeff: constant used to compute the cost of a shift. | |
``shift cost = shift_cost_coeff * |i - j|`` where ``i`` and ``j`` | |
are the positions indicating the shift | |
:type shift_cost_coeff: float | |
:param boundary: boundary value | |
:type boundary: str or int or bool | |
:rtype: float | |
""" | |
ref_idx = [i for (i, val) in enumerate(ref) if val == boundary] | |
hyp_idx = [i for (i, val) in enumerate(hyp) if val == boundary] | |
nref_bound = len(ref_idx) | |
nhyp_bound = len(hyp_idx) | |
if nref_bound == 0 and nhyp_bound == 0: | |
return 0.0 | |
elif nref_bound > 0 and nhyp_bound == 0: | |
return nref_bound * ins_cost | |
elif nref_bound == 0 and nhyp_bound > 0: | |
return nhyp_bound * del_cost | |
mat = _init_mat(nhyp_bound + 1, nref_bound + 1, ins_cost, del_cost) | |
_ghd_aux(mat, hyp_idx, ref_idx, ins_cost, del_cost, shift_cost_coeff) | |
return mat[-1, -1] | |
# Beeferman's Pk text segmentation evaluation metric | |
def pk(ref, hyp, k=None, boundary="1"): | |
""" | |
Compute the Pk metric for a pair of segmentations A segmentation | |
is any sequence over a vocabulary of two items (e.g. "0", "1"), | |
where the specified boundary value is used to mark the edge of a | |
segmentation. | |
>>> '%.2f' % pk('0100'*100, '1'*400, 2) | |
'0.50' | |
>>> '%.2f' % pk('0100'*100, '0'*400, 2) | |
'0.50' | |
>>> '%.2f' % pk('0100'*100, '0100'*100, 2) | |
'0.00' | |
:param ref: the reference segmentation | |
:type ref: str or list | |
:param hyp: the segmentation to evaluate | |
:type hyp: str or list | |
:param k: window size, if None, set to half of the average reference segment length | |
:type boundary: str or int or bool | |
:param boundary: boundary value | |
:type boundary: str or int or bool | |
:rtype: float | |
""" | |
if k is None: | |
k = int(round(len(ref) / (ref.count(boundary) * 2.0))) | |
err = 0 | |
for i in range(len(ref) - k + 1): | |
r = ref[i : i + k].count(boundary) > 0 | |
h = hyp[i : i + k].count(boundary) > 0 | |
if r != h: | |
err += 1 | |
return err / (len(ref) - k + 1.0) | |