You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
233 lines
7.0 KiB
Python
233 lines
7.0 KiB
Python
# Natural Language Toolkit: Text Segmentation Metrics
|
|
#
|
|
# Copyright (C) 2001-2020 NLTK Project
|
|
# Author: Edward Loper <edloper@gmail.com>
|
|
# Steven Bird <stevenbird1@gmail.com>
|
|
# David Doukhan <david.doukhan@gmail.com>
|
|
# URL: <http://nltk.org/>
|
|
# For license information, see LICENSE.TXT
|
|
|
|
|
|
"""
|
|
Text Segmentation Metrics
|
|
|
|
1. Windowdiff
|
|
|
|
Pevzner, L., and Hearst, M., A Critique and Improvement of
|
|
an Evaluation Metric for Text Segmentation,
|
|
Computational Linguistics 28, 19-36
|
|
|
|
|
|
2. Generalized Hamming Distance
|
|
|
|
Bookstein A., Kulyukin V.A., Raita T.
|
|
Generalized Hamming Distance
|
|
Information Retrieval 5, 2002, pp 353-375
|
|
|
|
Baseline implementation in C++
|
|
http://digital.cs.usu.edu/~vkulyukin/vkweb/software/ghd/ghd.html
|
|
|
|
Study describing benefits of Generalized Hamming Distance Versus
|
|
WindowDiff for evaluating text segmentation tasks
|
|
Begsten, Y. Quel indice pour mesurer l'efficacite en segmentation de textes ?
|
|
TALN 2009
|
|
|
|
|
|
3. Pk text segmentation metric
|
|
|
|
Beeferman D., Berger A., Lafferty J. (1999)
|
|
Statistical Models for Text Segmentation
|
|
Machine Learning, 34, 177-210
|
|
"""
|
|
|
|
try:
|
|
import numpy as np
|
|
except ImportError:
|
|
pass
|
|
|
|
|
|
def windowdiff(seg1, seg2, k, boundary="1", weighted=False):
|
|
"""
|
|
Compute the windowdiff score for a pair of segmentations. A
|
|
segmentation is any sequence over a vocabulary of two items
|
|
(e.g. "0", "1"), where the specified boundary value is used to
|
|
mark the edge of a segmentation.
|
|
|
|
>>> s1 = "000100000010"
|
|
>>> s2 = "000010000100"
|
|
>>> s3 = "100000010000"
|
|
>>> '%.2f' % windowdiff(s1, s1, 3)
|
|
'0.00'
|
|
>>> '%.2f' % windowdiff(s1, s2, 3)
|
|
'0.30'
|
|
>>> '%.2f' % windowdiff(s2, s3, 3)
|
|
'0.80'
|
|
|
|
:param seg1: a segmentation
|
|
:type seg1: str or list
|
|
:param seg2: a segmentation
|
|
:type seg2: str or list
|
|
:param k: window width
|
|
:type k: int
|
|
:param boundary: boundary value
|
|
:type boundary: str or int or bool
|
|
:param weighted: use the weighted variant of windowdiff
|
|
:type weighted: boolean
|
|
:rtype: float
|
|
"""
|
|
|
|
if len(seg1) != len(seg2):
|
|
raise ValueError("Segmentations have unequal length")
|
|
if k > len(seg1):
|
|
raise ValueError(
|
|
"Window width k should be smaller or equal than segmentation lengths"
|
|
)
|
|
wd = 0
|
|
for i in range(len(seg1) - k + 1):
|
|
ndiff = abs(seg1[i : i + k].count(boundary) - seg2[i : i + k].count(boundary))
|
|
if weighted:
|
|
wd += ndiff
|
|
else:
|
|
wd += min(1, ndiff)
|
|
return wd / (len(seg1) - k + 1.0)
|
|
|
|
|
|
# Generalized Hamming Distance
|
|
|
|
|
|
def _init_mat(nrows, ncols, ins_cost, del_cost):
|
|
mat = np.empty((nrows, ncols))
|
|
mat[0, :] = ins_cost * np.arange(ncols)
|
|
mat[:, 0] = del_cost * np.arange(nrows)
|
|
return mat
|
|
|
|
|
|
def _ghd_aux(mat, rowv, colv, ins_cost, del_cost, shift_cost_coeff):
|
|
for i, rowi in enumerate(rowv):
|
|
for j, colj in enumerate(colv):
|
|
shift_cost = shift_cost_coeff * abs(rowi - colj) + mat[i, j]
|
|
if rowi == colj:
|
|
# boundaries are at the same location, no transformation required
|
|
tcost = mat[i, j]
|
|
elif rowi > colj:
|
|
# boundary match through a deletion
|
|
tcost = del_cost + mat[i, j + 1]
|
|
else:
|
|
# boundary match through an insertion
|
|
tcost = ins_cost + mat[i + 1, j]
|
|
mat[i + 1, j + 1] = min(tcost, shift_cost)
|
|
|
|
|
|
def ghd(ref, hyp, ins_cost=2.0, del_cost=2.0, shift_cost_coeff=1.0, boundary="1"):
|
|
"""
|
|
Compute the Generalized Hamming Distance for a reference and a hypothetical
|
|
segmentation, corresponding to the cost related to the transformation
|
|
of the hypothetical segmentation into the reference segmentation
|
|
through boundary insertion, deletion and shift operations.
|
|
|
|
A segmentation is any sequence over a vocabulary of two items
|
|
(e.g. "0", "1"), where the specified boundary value is used to
|
|
mark the edge of a segmentation.
|
|
|
|
Recommended parameter values are a shift_cost_coeff of 2.
|
|
Associated with a ins_cost, and del_cost equal to the mean segment
|
|
length in the reference segmentation.
|
|
|
|
>>> # Same examples as Kulyukin C++ implementation
|
|
>>> ghd('1100100000', '1100010000', 1.0, 1.0, 0.5)
|
|
0.5
|
|
>>> ghd('1100100000', '1100000001', 1.0, 1.0, 0.5)
|
|
2.0
|
|
>>> ghd('011', '110', 1.0, 1.0, 0.5)
|
|
1.0
|
|
>>> ghd('1', '0', 1.0, 1.0, 0.5)
|
|
1.0
|
|
>>> ghd('111', '000', 1.0, 1.0, 0.5)
|
|
3.0
|
|
>>> ghd('000', '111', 1.0, 2.0, 0.5)
|
|
6.0
|
|
|
|
:param ref: the reference segmentation
|
|
:type ref: str or list
|
|
:param hyp: the hypothetical segmentation
|
|
:type hyp: str or list
|
|
:param ins_cost: insertion cost
|
|
:type ins_cost: float
|
|
:param del_cost: deletion cost
|
|
:type del_cost: float
|
|
:param shift_cost_coeff: constant used to compute the cost of a shift.
|
|
shift cost = shift_cost_coeff * |i - j| where i and j are
|
|
the positions indicating the shift
|
|
:type shift_cost_coeff: float
|
|
:param boundary: boundary value
|
|
:type boundary: str or int or bool
|
|
:rtype: float
|
|
"""
|
|
|
|
ref_idx = [i for (i, val) in enumerate(ref) if val == boundary]
|
|
hyp_idx = [i for (i, val) in enumerate(hyp) if val == boundary]
|
|
|
|
nref_bound = len(ref_idx)
|
|
nhyp_bound = len(hyp_idx)
|
|
|
|
if nref_bound == 0 and nhyp_bound == 0:
|
|
return 0.0
|
|
elif nref_bound > 0 and nhyp_bound == 0:
|
|
return nref_bound * ins_cost
|
|
elif nref_bound == 0 and nhyp_bound > 0:
|
|
return nhyp_bound * del_cost
|
|
|
|
mat = _init_mat(nhyp_bound + 1, nref_bound + 1, ins_cost, del_cost)
|
|
_ghd_aux(mat, hyp_idx, ref_idx, ins_cost, del_cost, shift_cost_coeff)
|
|
return mat[-1, -1]
|
|
|
|
|
|
# Beeferman's Pk text segmentation evaluation metric
|
|
|
|
|
|
def pk(ref, hyp, k=None, boundary="1"):
|
|
"""
|
|
Compute the Pk metric for a pair of segmentations A segmentation
|
|
is any sequence over a vocabulary of two items (e.g. "0", "1"),
|
|
where the specified boundary value is used to mark the edge of a
|
|
segmentation.
|
|
|
|
>>> '%.2f' % pk('0100'*100, '1'*400, 2)
|
|
'0.50'
|
|
>>> '%.2f' % pk('0100'*100, '0'*400, 2)
|
|
'0.50'
|
|
>>> '%.2f' % pk('0100'*100, '0100'*100, 2)
|
|
'0.00'
|
|
|
|
:param ref: the reference segmentation
|
|
:type ref: str or list
|
|
:param hyp: the segmentation to evaluate
|
|
:type hyp: str or list
|
|
:param k: window size, if None, set to half of the average reference segment length
|
|
:type boundary: str or int or bool
|
|
:param boundary: boundary value
|
|
:type boundary: str or int or bool
|
|
:rtype: float
|
|
"""
|
|
|
|
if k is None:
|
|
k = int(round(len(ref) / (ref.count(boundary) * 2.0)))
|
|
|
|
err = 0
|
|
for i in range(len(ref) - k + 1):
|
|
r = ref[i : i + k].count(boundary) > 0
|
|
h = hyp[i : i + k].count(boundary) > 0
|
|
if r != h:
|
|
err += 1
|
|
return err / (len(ref) - k + 1.0)
|
|
|
|
|
|
# skip doctests if numpy is not installed
|
|
def setup_module(module):
|
|
from nose import SkipTest
|
|
|
|
try:
|
|
import numpy
|
|
except ImportError:
|
|
raise SkipTest("numpy is required for nltk.metrics.segmentation")
|