You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
647 lines
20 KiB
Python
647 lines
20 KiB
Python
# Natural Language Toolkit: Chunk format conversions
|
|
#
|
|
# Copyright (C) 2001-2019 NLTK Project
|
|
# Author: Edward Loper <edloper@gmail.com>
|
|
# Steven Bird <stevenbird1@gmail.com> (minor additions)
|
|
# URL: <http://nltk.org/>
|
|
# For license information, see LICENSE.TXT
|
|
from __future__ import print_function, unicode_literals, division
|
|
|
|
import re
|
|
|
|
from nltk.tree import Tree
|
|
from nltk.tag.mapping import map_tag
|
|
from nltk.tag.util import str2tuple
|
|
from nltk.compat import python_2_unicode_compatible
|
|
|
|
##//////////////////////////////////////////////////////
|
|
## EVALUATION
|
|
##//////////////////////////////////////////////////////
|
|
|
|
from nltk.metrics import accuracy as _accuracy
|
|
|
|
|
|
def accuracy(chunker, gold):
    """
    Compute the tag-level accuracy of a chunker against gold-standard trees.

    Every gold tree is flattened (discarding its chunk structure) and
    re-parsed by ``chunker``.  The gold tree and the re-parsed tree are
    both converted to CoNLL tag triples with ``tree2conlltags``; the
    triples accumulated over all trees are then handed to
    ``nltk.metrics.accuracy``.

    :type chunker: ChunkParserI
    :param chunker: The chunker being evaluated.
    :type gold: tree
    :param gold: The chunk structures to score the chunker on.
    :rtype: float
    """
    reference = []
    predicted = []
    for gold_tree in gold:
        # Re-chunk the sentence from scratch, then compare tag by tag.
        rechunked = chunker.parse(gold_tree.flatten())
        reference.extend(tree2conlltags(gold_tree))
        predicted.extend(tree2conlltags(rechunked))
    return _accuracy(reference, predicted)
|
|
|
|
|
|
# Patched for increased performance by Yoav Goldberg <yoavg@cs.bgu.ac.il>, 2006-01-13
|
|
# -- statistics are evaluated only on demand, instead of at every sentence evaluation
|
|
#
|
|
# SB: use nltk.metrics for precision/recall scoring?
|
|
#
|
|
class ChunkScore(object):
    """
    A utility class for scoring chunk parsers.  ``ChunkScore`` can
    evaluate a chunk parser's output, based on a number of statistics
    (precision, recall, f-measure, missed chunks, incorrect chunks).
    It can also combine the scores from the parsing of multiple texts;
    this makes it significantly easier to evaluate a chunk parser that
    operates one sentence at a time.

    Texts are evaluated with the ``score`` method.  The results of
    evaluation can be accessed via a number of accessor methods, such
    as ``precision`` and ``f_measure``.  A typical use of the
    ``ChunkScore`` class is::

        >>> chunkscore = ChunkScore()           # doctest: +SKIP
        >>> for correct in correct_sentences:   # doctest: +SKIP
        ...     guess = chunkparser.parse(correct.leaves())   # doctest: +SKIP
        ...     chunkscore.score(correct, guess)              # doctest: +SKIP
        >>> print('F Measure:', chunkscore.f_measure())       # doctest: +SKIP
        F Measure: 0.823

    :ivar kwargs: Keyword arguments accepted by the constructor:

        - max_tp_examples, max_fp_examples, max_fn_examples: Intended
          upper bounds on the number of true-positive, false-positive
          and false-negative examples to record (default 100 each).
          They are stored on the instance, but none of the methods of
          this class currently truncates its result based on them, and
          they never affect the numerical metrics.

        - chunk_label: A regular expression indicating which chunks
          should be compared.  Defaults to ``'.*'`` (i.e., all chunks).

    Every recorded chunk is a pair ``((sentence_index, position), chunk)``
    where ``position`` is the offset of the chunk's first word.

    :type _correct: set
    :ivar _correct: All chunks seen in the gold-standard sentences
    :type _guessed: set
    :ivar _guessed: All chunks seen in the guessed sentences
    :type _tp: set
    :ivar _tp: Set of true positives
    :type _fp: set
    :ivar _fp: Set of false positives
    :type _fn: set
    :ivar _fn: Set of false negatives

    :type _tp_num: int
    :ivar _tp_num: Number of true positives
    :type _fp_num: int
    :ivar _fp_num: Number of false positives
    :type _fn_num: int
    :ivar _fn_num: Number of false negatives.
    """

    def __init__(self, **kwargs):
        self._correct = set()
        self._guessed = set()
        self._tp = set()
        self._fp = set()
        self._fn = set()
        self._max_tp = kwargs.get('max_tp_examples', 100)
        self._max_fp = kwargs.get('max_fp_examples', 100)
        self._max_fn = kwargs.get('max_fn_examples', 100)
        self._chunk_label = kwargs.get('chunk_label', '.*')
        self._tp_num = 0
        self._fp_num = 0
        self._fn_num = 0
        # Number of sentences scored so far; used as the sentence index
        # in each chunk's id.
        self._count = 0
        self._tags_correct = 0.0
        self._tags_total = 0.0

        # The tp/fp/fn sets are recomputed lazily, only when a metric is
        # requested after new sentences have been scored.
        self._measuresNeedUpdate = False

    def _updateMeasures(self):
        """Recompute the tp/fp/fn sets and counts if new data was scored."""
        if self._measuresNeedUpdate:
            self._tp = self._guessed & self._correct
            self._fn = self._correct - self._guessed
            self._fp = self._guessed - self._correct
            self._tp_num = len(self._tp)
            self._fp_num = len(self._fp)
            self._fn_num = len(self._fn)
            self._measuresNeedUpdate = False

    @staticmethod
    def _in_input_order(chunks):
        """
        Return the chunks of ``chunks`` ordered by their
        ``(sentence_index, position)`` id, with the id stripped off.
        """
        ordered = sorted(chunks, key=lambda entry: entry[0])
        return [entry[1] for entry in ordered]

    def score(self, correct, guessed):
        """
        Given a correctly chunked sentence, score another chunked
        version of the same sentence.

        :type correct: chunk structure
        :param correct: The known-correct ("gold standard") chunked
            sentence.
        :type guessed: chunk structure
        :param guessed: The chunked sentence to be scored.
        """
        self._correct |= _chunksets(correct, self._count, self._chunk_label)
        self._guessed |= _chunksets(guessed, self._count, self._chunk_label)
        self._count += 1
        self._measuresNeedUpdate = True
        # Keep track of per-tag accuracy (if possible)
        try:
            correct_tags = tree2conlltags(correct)
            guessed_tags = tree2conlltags(guessed)
        except ValueError:
            # This exception case is for nested chunk structures,
            # where tree2conlltags will fail with a ValueError: "Tree
            # is too deeply nested to be printed in CoNLL format."
            correct_tags = guessed_tags = ()
        self._tags_total += len(correct_tags)
        self._tags_correct += sum(
            1 for (t, g) in zip(guessed_tags, correct_tags) if t == g
        )

    def accuracy(self):
        """
        Return the overall tag-based accuracy for all text that have
        been scored by this ``ChunkScore``, using the IOB (conll2000)
        tag encoding.  Returns 1 when no tags have been scored.

        :rtype: float
        """
        if self._tags_total == 0:
            return 1
        return self._tags_correct / self._tags_total

    def precision(self):
        """
        Return the overall precision for all texts that have been
        scored by this ``ChunkScore``.  Returns 0 when nothing was guessed.

        :rtype: float
        """
        self._updateMeasures()
        div = self._tp_num + self._fp_num
        if div == 0:
            return 0
        return self._tp_num / div

    def recall(self):
        """
        Return the overall recall for all texts that have been
        scored by this ``ChunkScore``.  Returns 0 when there are no
        gold-standard chunks.

        :rtype: float
        """
        self._updateMeasures()
        div = self._tp_num + self._fn_num
        if div == 0:
            return 0
        return self._tp_num / div

    def f_measure(self, alpha=0.5):
        """
        Return the overall F measure for all texts that have been
        scored by this ``ChunkScore``.

        :param alpha: the relative weighting of precision and recall.
            Larger alpha biases the score towards the precision value,
            while smaller alpha biases the score towards the recall
            value.  ``alpha`` should have a value in the range [0,1].
        :type alpha: float
        :rtype: float
        """
        self._updateMeasures()
        p = self.precision()
        r = self.recall()
        # Both denominators below are non-zero past this guard, so
        # alpha == 0 and alpha == 1 are safe as well.
        if p == 0 or r == 0:
            return 0
        return 1 / (alpha / p + (1 - alpha) / r)

    def missed(self):
        """
        Return the chunks which were included in the
        correct chunk structures, but not in the guessed chunk
        structures, listed in input order.

        :rtype: list of chunks
        """
        self._updateMeasures()
        return self._in_input_order(self._fn)

    def incorrect(self):
        """
        Return the chunks which were included in the guessed chunk structures,
        but not in the correct chunk structures, listed in input order.

        :rtype: list of chunks
        """
        self._updateMeasures()
        return self._in_input_order(self._fp)

    def correct(self):
        """
        Return the chunks which were included in the correct
        chunk structures, listed in input order.

        :rtype: list of chunks
        """
        return self._in_input_order(self._correct)

    def guessed(self):
        """
        Return the chunks which were included in the guessed
        chunk structures, listed in input order.

        :rtype: list of chunks
        """
        return self._in_input_order(self._guessed)

    def __len__(self):
        """Return the number of gold-standard chunks scored so far."""
        self._updateMeasures()
        return self._tp_num + self._fn_num

    def __repr__(self):
        """
        Return a concise representation of this ``ChunkScoring``.

        :rtype: str
        """
        return '<ChunkScoring of ' + repr(len(self)) + ' chunks>'

    def __str__(self):
        """
        Return a verbose representation of this ``ChunkScoring``.
        This representation includes the precision, recall, and
        f-measure scores.  For other information about the score,
        use the accessor methods (e.g., ``missed()`` and ``incorrect()``).

        :rtype: str
        """
        # str.format does not treat '%' specially, so a single '%' is
        # used; '%%' would be printed verbatim as two percent signs.
        return (
            "ChunkParse score:\n"
            + (" IOB Accuracy: {:5.1f}%\n".format(self.accuracy() * 100))
            + (" Precision: {:5.1f}%\n".format(self.precision() * 100))
            + (" Recall: {:5.1f}%\n".format(self.recall() * 100))
            + (" F-Measure: {:5.1f}%".format(self.f_measure() * 100))
        )
|
|
|
|
|
|
# extract chunks, and assign unique id, the absolute position of
|
|
# the first word of the chunk
|
|
def _chunksets(t, count, chunk_label):
    """
    Collect the top-level chunks of ``t`` whose label matches ``chunk_label``.

    Each chunk is identified by ``(count, offset)``, where ``offset`` is
    the position of the chunk's first word among the leaves of ``t``,
    and is paired with a frozen copy of the chunk subtree.

    :param t: The chunk structure to scan (only direct children are examined).
    :param count: Sentence index stored in every chunk id.
    :param chunk_label: Regular expression matched (with ``re.match``)
        against each chunk's label.
    :rtype: set
    """
    offset = 0
    found = set()
    for child in t:
        if not isinstance(child, Tree):
            # A plain token occupies exactly one word position.
            offset += 1
            continue
        if re.match(chunk_label, child.label()):
            found.add(((count, offset), child.freeze()))
        offset += len(child.leaves())
    return found
|
|
|
|
|
|
def tagstr2tree(
    s, chunk_label="NP", root_label="S", sep='/', source_tagset=None, target_tagset=None
):
    """
    Divide a string of bracketted tagged text into
    chunks and unchunked tokens, and produce a Tree.
    Chunks are marked by square brackets (``[...]``) and may not be
    nested.  Words are delimited by whitespace; unless ``sep`` is None
    each word is split into a ``(word, tag)`` pair with ``str2tuple``.

    :param s: The string to be converted
    :type s: str
    :param chunk_label: The label to use for chunk nodes
    :type chunk_label: str
    :param root_label: The label to use for the root of the tree
    :type root_label: str
    :param sep: The word/tag separator handed to ``str2tuple``; if None,
        the raw token text is stored instead of a ``(word, tag)`` pair.
    :param source_tagset: Tagset of the tags in ``s``; only used when
        ``target_tagset`` is also given.
    :param target_tagset: Tagset to map the tags to with ``map_tag``.
    :rtype: Tree
    :raise ValueError: If the brackets are nested or unbalanced.
    """
    token_re = re.compile(r'\[|\]|[^\[\]\s]+')

    # stack[0] is the root; stack[1], when present, is the open chunk.
    stack = [Tree(root_label, [])]
    for match in token_re.finditer(s):
        text = match.group()
        first = text[0]

        if first == '[':
            if len(stack) != 1:
                raise ValueError('Unexpected [ at char {:d}'.format(match.start()))
            chunk = Tree(chunk_label, [])
            stack[-1].append(chunk)
            stack.append(chunk)
            continue

        if first == ']':
            if len(stack) != 2:
                raise ValueError('Unexpected ] at char {:d}'.format(match.start()))
            stack.pop()
            continue

        if sep is None:
            stack[-1].append(text)
            continue

        word, tag = str2tuple(text, sep)
        if source_tagset and target_tagset:
            tag = map_tag(source_tagset, target_tagset, tag)
        stack[-1].append((word, tag))

    if len(stack) != 1:
        raise ValueError('Expected ] at char {:d}'.format(len(s)))
    return stack[0]
|
|
|
|
|
|
### CONLL
|
|
|
|
# Matches one CoNLL-2000 style line, ``word tag chunk-tag``, where the
# chunk tag is ``I``, ``O`` or ``B`` optionally followed by ``-TYPE``.
# The groups are (word, tag, state, chunk_type); chunk_type is None when
# the line carries no type.  A raw string is used so that ``\S`` and
# ``\s`` reach the regex engine instead of being parsed as (invalid)
# string escape sequences.
_LINE_RE = re.compile(r'(\S+)\s+(\S+)\s+([IOB])-?(\S+)?')
|
|
|
|
|
|
def conllstr2tree(s, chunk_types=('NP', 'PP', 'VP'), root_label="S"):
    """
    Build a chunk structure for a single sentence from a CoNLL 2000
    style IOB string.

    Each non-blank line of ``s`` must match ``_LINE_RE`` (word, tag and
    an ``I``/``O``/``B`` chunk tag).  Chunks whose type is not listed in
    ``chunk_types`` are treated as outside any chunk.  An ``I`` tag whose
    type differs from the label of the node currently being filled
    starts a new chunk, exactly like a ``B`` tag.

    :param s: The CoNLL string to be converted.
    :type s: str
    :param chunk_types: The chunk types to be converted; None keeps
        every chunk type.
    :type chunk_types: tuple
    :param root_label: The node label to use for the root.
    :type root_label: str
    :rtype: Tree
    :raise ValueError: If a non-blank line cannot be parsed; the message
        carries the zero-based line number.
    """
    # stack[0] is the root; stack[1], when present, is the open chunk.
    stack = [Tree(root_label, [])]

    for lineno, line in enumerate(s.split('\n')):
        if not line.strip():
            continue

        parsed = _LINE_RE.match(line)
        if parsed is None:
            raise ValueError('Error on line {:d}'.format(lineno))
        word, tag, state, chunk_type = parsed.groups()

        # Chunk types we were not asked for count as "outside".
        if chunk_types is not None and chunk_type not in chunk_types:
            state = 'O'

        # Must be evaluated before the open chunk is closed below.
        mismatch_I = state == 'I' and chunk_type != stack[-1].label()
        starts_chunk = state == 'B' or mismatch_I

        # "Begin", "Outside" and a mismatching "Inside" all close the
        # chunk that is currently open, if any.
        if (starts_chunk or state == 'O') and len(stack) == 2:
            stack.pop()

        if starts_chunk:
            chunk = Tree(chunk_type, [])
            stack[-1].append(chunk)
            stack.append(chunk)

        stack[-1].append((word, tag))

    return stack[0]
|
|
|
|
|
|
def tree2conlltags(t):
    """
    Convert a tree to the CoNLL IOB tag format.

    Returns one ``(word, tag, IOB-tag)`` triple per token.  Children of
    ``t`` that have a ``label()`` method are treated as chunks: their
    first token is tagged ``B-<label>`` and the following ones
    ``I-<label>``.  Any other child is tagged ``O``.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: list(tuple)
    :raise ValueError: If a chunk itself contains a ``Tree``.
    """
    rows = []
    for node in t:
        try:
            label = node.label()
            for position, leaf in enumerate(node):
                if isinstance(leaf, Tree):
                    raise ValueError(
                        "Tree is too deeply nested to be printed in CoNLL format"
                    )
                marker = "I-" if position else "B-"
                rows.append((leaf[0], leaf[1], marker + label))
        except AttributeError:
            # No label() method: a plain (word, tag) token outside any chunk.
            rows.append((node[0], node[1], "O"))
    return rows
|
|
|
|
|
|
def conlltags2tree(
    sentence, chunk_types=('NP', 'PP', 'VP'), root_label='S', strict=False
):
    """
    Convert the CoNLL IOB format to a tree.

    :param sentence: An iterable of ``(word, postag, chunktag)`` triples.
    :param chunk_types: Accepted for interface compatibility; it is not
        consulted by this function.
    :param root_label: The node label to use for the root.
    :param strict: If true, a ``None`` chunk tag or an ``I-`` tag that
        does not continue the preceding chunk raises ``ValueError``;
        otherwise they are treated as ``O`` and ``B-`` respectively.
    :rtype: Tree
    :raise ValueError: On an unrecognised chunk tag, or on a bad tag
        sequence when ``strict`` is true.
    """
    tree = Tree(root_label, [])
    for (word, postag, chunktag) in sentence:
        token = (word, postag)

        if chunktag is None:
            if strict:
                raise ValueError("Bad conll tag sequence")
            # Treat as O
            tree.append(token)
            continue

        if chunktag.startswith('B-'):
            tree.append(Tree(chunktag[2:], [token]))
            continue

        if chunktag.startswith('I-'):
            label = chunktag[2:]
            continues_chunk = (
                len(tree) > 0
                and isinstance(tree[-1], Tree)
                and tree[-1].label() == label
            )
            if continues_chunk:
                tree[-1].append(token)
            elif strict:
                raise ValueError("Bad conll tag sequence")
            else:
                # Treat as B-*
                tree.append(Tree(label, [token]))
            continue

        if chunktag == 'O':
            tree.append(token)
            continue

        raise ValueError("Bad conll tag {0!r}".format(chunktag))
    return tree
|
|
|
|
|
|
def tree2conllstr(t):
    """
    Convert a tree to the CoNLL IOB string format.

    The result has one line per token, each holding the word, its tag
    and its IOB tag (as produced by ``tree2conlltags``) separated by
    single spaces.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: str
    """
    rendered = []
    for triple in tree2conlltags(t):
        rendered.append(" ".join(triple))
    return '\n'.join(rendered)
|
|
|
|
|
|
### IEER
|
|
|
|
# Matches a complete IEER document.  DOCNO, DOCTYPE, DATE_TIME and
# HEADLINE are optional; the named groups expose their contents (None
# when absent) together with the body text.
_IEER_DOC_RE = re.compile(
    r'<DOC>\s*'
    r'(<DOCNO>\s*(?P<docno>.+?)\s*</DOCNO>\s*)?'
    r'(<DOCTYPE>\s*(?P<doctype>.+?)\s*</DOCTYPE>\s*)?'
    r'(<DATE_TIME>\s*(?P<date_time>.+?)\s*</DATE_TIME>\s*)?'
    r'<BODY>\s*'
    r'(<HEADLINE>\s*(?P<headline>.+?)\s*</HEADLINE>\s*)?'
    r'<TEXT>(?P<text>.*?)</TEXT>\s*'
    r'</BODY>\s*</DOC>\s*',
    re.DOTALL,
)

# Extracts the ``type`` attribute from an entity begin tag (``<b_...>``).
# A raw string is used so that ``\w`` and ``\s`` reach the regex engine
# instead of being parsed as (invalid) string escape sequences.
_IEER_TYPE_RE = re.compile(r'<b_\w+\s+[^>]*?type="(?P<type>\w+)"')
|
|
|
|
|
|
def _ieer_read_text(s, root_label):
    """
    Parse IEER-annotated text into a tree of entity chunks.

    ``<b_...>`` tags open a chunk labelled with the tag's ``type``
    attribute and ``<e_...>`` tags close the innermost open chunk.
    Every other piece, including any other markup tag, is appended to
    the current node as a plain token.

    :param s: The annotated text, or None if there is none (e.g. a
        document without a headline).
    :param root_label: The node label to use for the root.
    :return: The parsed tree, or an empty list when ``s`` is None.
    :raise ValueError: If a begin tag carries no ``type`` attribute or
        the begin/end tags are not balanced.
    """
    # s will be None if there is no headline in the text;
    # return the empty list in place of a Tree.
    if s is None:
        return []

    stack = [Tree(root_label, [])]
    for piece_m in re.finditer(r'<[^>]+>|[^\s<]+', s):
        piece = piece_m.group()
        try:
            if piece.startswith('<b_'):
                m = _IEER_TYPE_RE.match(piece)
                if m is None:
                    # Reported below as a positioned "Bad IEER string"
                    # error rather than failing on m.group().
                    raise ValueError(piece)
                chunk = Tree(m.group('type'), [])
                stack[-1].append(chunk)
                stack.append(chunk)
            elif piece.startswith('<e_'):
                stack.pop()
            else:
                stack[-1].append(piece)
        except (IndexError, ValueError):
            raise ValueError(
                'Bad IEER string (error at character {:d})'.format(piece_m.start())
            )
    if len(stack) != 1:
        raise ValueError('Bad IEER string')
    return stack[0]
|
|
|
|
|
|
def ieerstr2tree(
    s,
    chunk_types=[
        'LOCATION',
        'ORGANIZATION',
        'PERSON',
        'DURATION',
        'DATE',
        'CARDINAL',
        'PERCENT',
        'MONEY',
        'MEASURE',
    ],
    root_label="S",
):
    """
    Convert a string of chunked text in the IEER named entity format
    into chunk structures.  Chunks are of several types, LOCATION,
    ORGANIZATION, PERSON, DURATION, DATE, CARDINAL, PERCENT, MONEY,
    and MEASURE.

    If ``s`` is a complete ``<DOC>`` document, a dictionary is returned
    with the keys ``text``, ``docno``, ``doctype``, ``date_time`` and
    ``headline``; ``text`` and ``headline`` hold the parsed chunk
    structures, the others the raw strings from the document (None when
    absent).  Otherwise the whole string is parsed as if it were the
    contents of ``<TEXT>...</TEXT>`` and the resulting tree is returned.

    :param s: The IEER string to be converted.
    :param chunk_types: Not consulted by this function.
    :param root_label: The node label to use for the root(s).
    :rtype: Tree or dict
    """
    doc = _IEER_DOC_RE.match(s)
    if doc is None:
        # Not a full document: treat everything as body text.
        return _ieer_read_text(s, root_label)

    body = _ieer_read_text(doc.group('text'), root_label)
    parsed = {
        'text': body,
        'docno': doc.group('docno'),
        'doctype': doc.group('doctype'),
        'date_time': doc.group('date_time'),
    }
    # The headline is parsed too, so that its named entities are captured.
    parsed['headline'] = _ieer_read_text(doc.group('headline'), root_label)
    return parsed
|
|
|
|
|
|
def demo():
    """
    Print a short demonstration of the conversions in this module: a
    bracketed tagged string parsed into a tree, a CoNLL string parsed
    into a tree, and that tree rendered back as a CoNLL string.
    """
    import nltk

    bracketed = "[ Pierre/NNP Vinken/NNP ] ,/, [ 61/CD years/NNS ] old/JJ ,/, will/MD join/VB [ the/DT board/NN ] ./."
    bracket_tree = nltk.chunk.tagstr2tree(bracketed, chunk_label='NP')
    bracket_tree.pprint()
    print()

    conll_text = """
These DT B-NP
research NN I-NP
protocols NNS I-NP
offer VBP B-VP
to TO B-PP
the DT B-NP
patient NN I-NP
not RB O
only RB O
the DT B-NP
very RB I-NP
best JJS I-NP
therapy NN I-NP
which WDT B-NP
we PRP B-NP
have VBP B-VP
established VBN I-VP
today NN B-NP
but CC B-NP
also RB I-NP
the DT B-NP
hope NN I-NP
of IN B-PP
something NN B-NP
still RB B-ADJP
better JJR I-ADJP
. . O
"""

    # Only NP and PP chunks are kept; every other chunk type becomes "O".
    conll_tree = conllstr2tree(conll_text, chunk_types=('NP', 'PP'))
    conll_tree.pprint()

    # Demonstrate CoNLL output
    print("CoNLL output:")
    print(nltk.chunk.tree2conllstr(conll_tree))
    print()


# Run the demonstration when the module is executed as a script.
if __name__ == '__main__':
    demo()
|