# Natural Language Toolkit: Chunk format conversions
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com> (minor additions)
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
import re
from nltk.tree import Tree
from nltk.tag.mapping import map_tag
from nltk.tag.util import str2tuple
##//////////////////////////////////////////////////////
## EVALUATION
##//////////////////////////////////////////////////////
from nltk.metrics import accuracy as _accuracy
def accuracy(chunker, gold):
"""
Score the accuracy of the chunker against the gold standard.
Strip the chunk information from the gold standard and rechunk it using
the chunker, then compute the accuracy score.
:type chunker: ChunkParserI
:param chunker: The chunker being evaluated.
:type gold: list(Tree)
:param gold: The chunk structures to score the chunker on.
:rtype: float
"""
gold_tags = []
test_tags = []
for gold_tree in gold:
test_tree = chunker.parse(gold_tree.flatten())
gold_tags += tree2conlltags(gold_tree)
test_tags += tree2conlltags(test_tree)
# print 'GOLD:', gold_tags[:50]
# print 'TEST:', test_tags[:50]
return _accuracy(gold_tags, test_tags)
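# Usage sketch: how ``accuracy`` is typically driven from a chunked corpus
# reader such as ``nltk.corpus.conll2000``.  The helper name ``_demo_accuracy``
# is hypothetical; the sketch assumes the conll2000 corpus has been downloaded,
# and ``chunker`` is any ChunkParserI implementation.
def _demo_accuracy(chunker):
    from nltk.corpus import conll2000

    gold = conll2000.chunked_sents("test.txt", chunk_types=["NP"])
    return accuracy(chunker, gold)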
# Patched for increased performance by Yoav Goldberg <yoavg@cs.bgu.ac.il>, 2006-01-13
# -- statistics are evaluated only on demand, instead of at every sentence evaluation
#
# SB: use nltk.metrics for precision/recall scoring?
#
class ChunkScore(object):
"""
A utility class for scoring chunk parsers. ``ChunkScore`` can
evaluate a chunk parser's output, based on a number of statistics
(precision, recall, f-measure, missed chunks, incorrect chunks).
It can also combine the scores from the parsing of multiple texts;
this makes it significantly easier to evaluate a chunk parser that
operates one sentence at a time.
Texts are evaluated with the ``score`` method. The results of
evaluation can be accessed via a number of accessor methods, such
as ``precision`` and ``f_measure``. A typical use of the
``ChunkScore`` class is::
>>> chunkscore = ChunkScore() # doctest: +SKIP
>>> for correct in correct_sentences: # doctest: +SKIP
... guess = chunkparser.parse(correct.leaves()) # doctest: +SKIP
... chunkscore.score(correct, guess) # doctest: +SKIP
>>> print('F Measure:', chunkscore.f_measure()) # doctest: +SKIP
F Measure: 0.823
:ivar kwargs: Keyword arguments:
- max_tp_examples: The maximum number of actual examples of true
positives to record. This affects the ``correct`` member
function: ``correct`` will not return more than this number
of true positive examples. This does *not* affect any of
the numerical metrics (precision, recall, or f-measure)
- max_fp_examples: The maximum number of actual examples of false
positives to record. This affects the ``incorrect`` member
function and the ``guessed`` member function: ``incorrect``
will not return more than this number of examples, and
``guessed`` will not return more than this number of true
positive examples. This does *not* affect any of the
numerical metrics (precision, recall, or f-measure)
- max_fn_examples: The maximum number of actual examples of false
negatives to record. This affects the ``missed`` member
function and the ``correct`` member function: ``missed``
will not return more than this number of examples, and
``correct`` will not return more than this number of true
negative examples. This does *not* affect any of the
numerical metrics (precision, recall, or f-measure)
- chunk_label: A regular expression indicating which chunks
should be compared. Defaults to ``'.*'`` (i.e., all chunks).
:type _tp: list(Token)
:ivar _tp: List of true positives
:type _fp: list(Token)
:ivar _fp: List of false positives
:type _fn: list(Token)
:ivar _fn: List of false negatives
:type _tp_num: int
:ivar _tp_num: Number of true positives
:type _fp_num: int
:ivar _fp_num: Number of false positives
:type _fn_num: int
:ivar _fn_num: Number of false negatives.
"""
def __init__(self, **kwargs):
self._correct = set()
self._guessed = set()
self._tp = set()
self._fp = set()
self._fn = set()
self._max_tp = kwargs.get("max_tp_examples", 100)
self._max_fp = kwargs.get("max_fp_examples", 100)
self._max_fn = kwargs.get("max_fn_examples", 100)
self._chunk_label = kwargs.get("chunk_label", ".*")
self._tp_num = 0
self._fp_num = 0
self._fn_num = 0
self._count = 0
self._tags_correct = 0.0
self._tags_total = 0.0
self._measuresNeedUpdate = False
def _updateMeasures(self):
if self._measuresNeedUpdate:
self._tp = self._guessed & self._correct
self._fn = self._correct - self._guessed
self._fp = self._guessed - self._correct
self._tp_num = len(self._tp)
self._fp_num = len(self._fp)
self._fn_num = len(self._fn)
self._measuresNeedUpdate = False
def score(self, correct, guessed):
"""
Given a correctly chunked sentence, score another chunked
version of the same sentence.
:type correct: chunk structure
:param correct: The known-correct ("gold standard") chunked
sentence.
:type guessed: chunk structure
:param guessed: The chunked sentence to be scored.
"""
self._correct |= _chunksets(correct, self._count, self._chunk_label)
self._guessed |= _chunksets(guessed, self._count, self._chunk_label)
self._count += 1
self._measuresNeedUpdate = True
# Keep track of per-tag accuracy (if possible)
try:
correct_tags = tree2conlltags(correct)
guessed_tags = tree2conlltags(guessed)
except ValueError:
# This exception case is for nested chunk structures,
# where tree2conlltags will fail with a ValueError: "Tree
# is too deeply nested to be printed in CoNLL format."
correct_tags = guessed_tags = ()
self._tags_total += len(correct_tags)
self._tags_correct += sum(
1 for (t, g) in zip(guessed_tags, correct_tags) if t == g
)
def accuracy(self):
"""
Return the overall tag-based accuracy for all texts that have
been scored by this ``ChunkScore``, using the IOB (conll2000)
tag encoding.
:rtype: float
"""
if self._tags_total == 0:
return 1
return self._tags_correct / self._tags_total
def precision(self):
"""
Return the overall precision for all texts that have been
scored by this ``ChunkScore``.
:rtype: float
"""
self._updateMeasures()
div = self._tp_num + self._fp_num
if div == 0:
return 0
else:
return self._tp_num / div
def recall(self):
"""
Return the overall recall for all texts that have been
scored by this ``ChunkScore``.
:rtype: float
"""
self._updateMeasures()
div = self._tp_num + self._fn_num
if div == 0:
return 0
else:
return self._tp_num / div
def f_measure(self, alpha=0.5):
"""
Return the overall F measure for all texts that have been
scored by this ``ChunkScore``.
:param alpha: the relative weighting of precision and recall.
Larger alpha biases the score towards the precision value,
while smaller alpha biases the score towards the recall
value. ``alpha`` should have a value in the range [0,1].
:type alpha: float
:rtype: float
"""
self._updateMeasures()
p = self.precision()
r = self.recall()
if p == 0 or r == 0: # what if alpha is 0 or 1?
return 0
return 1 / (alpha / p + (1 - alpha) / r)
def missed(self):
"""
Return the chunks which were included in the
correct chunk structures, but not in the guessed chunk
structures, listed in input order.
:rtype: list of chunks
"""
self._updateMeasures()
chunks = list(self._fn)
return [c[1] for c in chunks] # discard position information
def incorrect(self):
"""
Return the chunks which were included in the guessed chunk structures,
but not in the correct chunk structures, listed in input order.
:rtype: list of chunks
"""
self._updateMeasures()
chunks = list(self._fp)
return [c[1] for c in chunks] # discard position information
def correct(self):
"""
Return the chunks which were included in the correct
chunk structures, listed in input order.
:rtype: list of chunks
"""
chunks = list(self._correct)
return [c[1] for c in chunks] # discard position information
def guessed(self):
"""
Return the chunks which were included in the guessed
chunk structures, listed in input order.
:rtype: list of chunks
"""
chunks = list(self._guessed)
return [c[1] for c in chunks] # discard position information
def __len__(self):
self._updateMeasures()
return self._tp_num + self._fn_num
def __repr__(self):
"""
Return a concise representation of this ``ChunkScore``.
:rtype: str
"""
return "<ChunkScoring of " + repr(len(self)) + " chunks>"
def __str__(self):
"""
Return a verbose representation of this ``ChunkScore``.
This representation includes the precision, recall, and
f-measure scores. For other information about the score,
use the accessor methods (e.g., ``missed()`` and ``incorrect()``).
:rtype: str
"""
return (
"ChunkParse score:\n"
+ (" IOB Accuracy: {:5.1f}%%\n".format(self.accuracy() * 100))
+ (" Precision: {:5.1f}%%\n".format(self.precision() * 100))
+ (" Recall: {:5.1f}%%\n".format(self.recall() * 100))
+ (" F-Measure: {:5.1f}%%".format(self.f_measure() * 100))
)
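# Usage sketch: driving ``ChunkScore`` sentence by sentence, as described in
# the class docstring.  ``chunker`` is any ChunkParserI implementation and the
# helper name ``_demo_chunkscore`` is hypothetical.
def _demo_chunkscore(chunker, gold_sents):
    chunkscore = ChunkScore()
    for correct in gold_sents:
        # Re-chunk the bare tagged tokens and compare with the gold tree.
        guess = chunker.parse(correct.flatten())
        chunkscore.score(correct, guess)
    print(chunkscore)  # precision, recall and F-measure summary
    return chunkscore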
# extract chunks, and assign unique id, the absolute position of
# the first word of the chunk
def _chunksets(t, count, chunk_label):
pos = 0
chunks = []
for child in t:
if isinstance(child, Tree):
if re.match(chunk_label, child.label()):
chunks.append(((count, pos), child.freeze()))
pos += len(child.leaves())
else:
pos += 1
return set(chunks)
def tagstr2tree(
s, chunk_label="NP", root_label="S", sep="/", source_tagset=None, target_tagset=None
):
"""
Divide a string of bracketed tagged text into
chunks and unchunked tokens, and produce a Tree.
Chunks are marked by square brackets (``[...]``). Words are
delimited by whitespace, and each word should have the form
``text/tag``. Words that do not contain a slash are
assigned a ``tag`` of None.
:param s: The string to be converted
:type s: str
:param chunk_label: The label to use for chunk nodes
:type chunk_label: str
:param root_label: The label to use for the root of the tree
:type root_label: str
:rtype: Tree
"""
WORD_OR_BRACKET = re.compile(r"\[|\]|[^\[\]\s]+")
stack = [Tree(root_label, [])]
for match in WORD_OR_BRACKET.finditer(s):
text = match.group()
if text[0] == "[":
if len(stack) != 1:
raise ValueError("Unexpected [ at char {:d}".format(match.start()))
chunk = Tree(chunk_label, [])
stack[-1].append(chunk)
stack.append(chunk)
elif text[0] == "]":
if len(stack) != 2:
raise ValueError("Unexpected ] at char {:d}".format(match.start()))
stack.pop()
else:
if sep is None:
stack[-1].append(text)
else:
word, tag = str2tuple(text, sep)
if source_tagset and target_tagset:
tag = map_tag(source_tagset, target_tagset, tag)
stack[-1].append((word, tag))
if len(stack) != 1:
raise ValueError("Expected ] at char {:d}".format(len(s)))
return stack[0]
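# Usage sketch for ``tagstr2tree`` (the helper name is hypothetical): a
# bracketed, slash-tagged string becomes a shallow chunk Tree.
def _example_tagstr2tree():
    # -> Tree('S', [Tree('NP', [('the', 'DT'), ('dog', 'NN')]), ('barked', 'VBD')])
    return tagstr2tree("[ the/DT dog/NN ] barked/VBD", chunk_label="NP")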
### CONLL
_LINE_RE = re.compile(r"(\S+)\s+(\S+)\s+([IOB])-?(\S+)?")
def conllstr2tree(s, chunk_types=("NP", "PP", "VP"), root_label="S"):
"""
Return a chunk structure for a single sentence
encoded in the given CONLL 2000 style string.
This function converts a CoNLL IOB string into a tree.
It uses the specified chunk types
(defaults to NP, PP and VP), and creates a tree rooted at a node
labeled S (by default).
:param s: The CoNLL string to be converted.
:type s: str
:param chunk_types: The chunk types to be converted.
:type chunk_types: tuple
:param root_label: The node label to use for the root.
:type root_label: str
:rtype: Tree
"""
stack = [Tree(root_label, [])]
for lineno, line in enumerate(s.split("\n")):
if not line.strip():
continue
# Decode the line.
match = _LINE_RE.match(line)
if match is None:
raise ValueError("Error on line {:d}".format(lineno))
(word, tag, state, chunk_type) = match.groups()
# If it's a chunk type we don't care about, treat it as O.
if chunk_types is not None and chunk_type not in chunk_types:
state = "O"
# For "Begin"/"Outside", finish any completed chunks -
# also do so for "Inside" tags that don't match the currently open chunk.
mismatch_I = state == "I" and chunk_type != stack[-1].label()
if state in "BO" or mismatch_I:
if len(stack) == 2:
stack.pop()
# For "Begin", start a new chunk.
if state == "B" or mismatch_I:
chunk = Tree(chunk_type, [])
stack[-1].append(chunk)
stack.append(chunk)
# Add the new word token.
stack[-1].append((word, tag))
return stack[0]
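# Usage sketch for ``conllstr2tree`` (the helper name is hypothetical): three
# CoNLL-style lines become a Tree containing one NP chunk and one O token.
def _example_conllstr2tree():
    s = "the DT B-NP\ndog NN I-NP\nbarked VBD O"
    # -> Tree('S', [Tree('NP', [('the', 'DT'), ('dog', 'NN')]), ('barked', 'VBD')])
    return conllstr2tree(s, chunk_types=("NP",))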
def tree2conlltags(t):
"""
Return a list of 3-tuples containing ``(word, tag, IOB-tag)``.
Convert a tree to the CoNLL IOB tag format.
:param t: The tree to be converted.
:type t: Tree
:rtype: list(tuple)
"""
tags = []
for child in t:
try:
category = child.label()
prefix = "B-"
for contents in child:
if isinstance(contents, Tree):
raise ValueError(
"Tree is too deeply nested to be printed in CoNLL format"
)
tags.append((contents[0], contents[1], prefix + category))
prefix = "I-"
except AttributeError:
tags.append((child[0], child[1], "O"))
return tags
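# Usage sketch for ``tree2conlltags`` (the helper name is hypothetical):
# flattening a shallow chunk Tree into (word, tag, IOB-tag) triples.
def _example_tree2conlltags():
    t = Tree("S", [Tree("NP", [("the", "DT"), ("dog", "NN")]), ("barked", "VBD")])
    # -> [('the', 'DT', 'B-NP'), ('dog', 'NN', 'I-NP'), ('barked', 'VBD', 'O')]
    return tree2conlltags(t)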
def conlltags2tree(
sentence, chunk_types=("NP", "PP", "VP"), root_label="S", strict=False
):
"""
Convert the CoNLL IOB format to a tree.
"""
tree = Tree(root_label, [])
for (word, postag, chunktag) in sentence:
if chunktag is None:
if strict:
raise ValueError("Bad conll tag sequence")
else:
# Treat as O
tree.append((word, postag))
elif chunktag.startswith("B-"):
tree.append(Tree(chunktag[2:], [(word, postag)]))
elif chunktag.startswith("I-"):
if (
len(tree) == 0
or not isinstance(tree[-1], Tree)
or tree[-1].label() != chunktag[2:]
):
if strict:
raise ValueError("Bad conll tag sequence")
else:
# Treat as B-*
tree.append(Tree(chunktag[2:], [(word, postag)]))
else:
tree[-1].append((word, postag))
elif chunktag == "O":
tree.append((word, postag))
else:
raise ValueError("Bad conll tag {0!r}".format(chunktag))
return tree
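# Usage sketch for ``conlltags2tree`` (the helper name is hypothetical): the
# inverse of ``tree2conlltags``, turning (word, tag, IOB-tag) triples back
# into a Tree.
def _example_conlltags2tree():
    tags = [("the", "DT", "B-NP"), ("dog", "NN", "I-NP"), ("barked", "VBD", "O")]
    # -> Tree('S', [Tree('NP', [('the', 'DT'), ('dog', 'NN')]), ('barked', 'VBD')])
    return conlltags2tree(tags)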
def tree2conllstr(t):
"""
Return a multiline string where each line contains a word, tag and IOB tag.
Convert a tree to the CoNLL IOB string format
:param t: The tree to be converted.
:type t: Tree
:rtype: str
"""
lines = [" ".join(token) for token in tree2conlltags(t)]
return "\n".join(lines)
### IEER
_IEER_DOC_RE = re.compile(
r"<DOC>\s*"
r"(<DOCNO>\s*(?P<docno>.+?)\s*</DOCNO>\s*)?"
r"(<DOCTYPE>\s*(?P<doctype>.+?)\s*</DOCTYPE>\s*)?"
r"(<DATE_TIME>\s*(?P<date_time>.+?)\s*</DATE_TIME>\s*)?"
r"<BODY>\s*"
r"(<HEADLINE>\s*(?P<headline>.+?)\s*</HEADLINE>\s*)?"
r"<TEXT>(?P<text>.*?)</TEXT>\s*"
r"</BODY>\s*</DOC>\s*",
re.DOTALL,
)
_IEER_TYPE_RE = re.compile(r'<b_\w+\s+[^>]*?type="(?P<type>\w+)"')
def _ieer_read_text(s, root_label):
stack = [Tree(root_label, [])]
# s will be None if there is no headline in the text
# return the empty list in place of a Tree
if s is None:
return []
for piece_m in re.finditer(r"<[^>]+>|[^\s<]+", s):
piece = piece_m.group()
try:
if piece.startswith("<b_"):
m = _IEER_TYPE_RE.match(piece)
if m is None:
    # Malformed <b_...> open tag; the except clause below reports it as
    # a "Bad IEER string" error with the character position.
    raise ValueError
chunk = Tree(m.group("type"), [])
stack[-1].append(chunk)
stack.append(chunk)
elif piece.startswith("<e_"):
stack.pop()
# elif piece.startswith('<'):
# print "ERROR:", piece
# raise ValueError # Unexpected HTML
else:
stack[-1].append(piece)
except (IndexError, ValueError):
raise ValueError(
"Bad IEER string (error at character {:d})".format(piece_m.start())
)
if len(stack) != 1:
raise ValueError("Bad IEER string")
return stack[0]
def ieerstr2tree(
s,
chunk_types=[
"LOCATION",
"ORGANIZATION",
"PERSON",
"DURATION",
"DATE",
"CARDINAL",
"PERCENT",
"MONEY",
"MEASURE",
],
root_label="S",
):
"""
Return a chunk structure containing the chunked tagged text that is
encoded in the given IEER style string.
Convert a string of chunked tagged text in the IEER named
entity format into a chunk structure. Chunks are of several
types, LOCATION, ORGANIZATION, PERSON, DURATION, DATE, CARDINAL,
PERCENT, MONEY, and MEASURE.
:rtype: Tree
"""
# Try looking for a single document. If that doesn't work, then just
# treat everything as if it was within the <TEXT>...</TEXT>.
m = _IEER_DOC_RE.match(s)
if m:
return {
"text": _ieer_read_text(m.group("text"), root_label),
"docno": m.group("docno"),
"doctype": m.group("doctype"),
"date_time": m.group("date_time"),
#'headline': m.group('headline')
# we want to capture NEs in the headline too!
"headline": _ieer_read_text(m.group("headline"), root_label),
}
else:
return _ieer_read_text(s, root_label)
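# Usage sketch for ``ieerstr2tree`` (the helper name is hypothetical, and the
# input is a constructed string in the IEER tag style recognized by this
# module): named-entity spans become subtrees labelled with their type.
def _example_ieerstr2tree():
    s = 'A spokesman for <b_ENAMEX type="ORGANIZATION">Acme Corp.<e_ENAMEX> declined to comment.'
    # -> Tree('S', ['A', 'spokesman', 'for',
    #               Tree('ORGANIZATION', ['Acme', 'Corp.']), 'declined', 'to', 'comment.'])
    return ieerstr2tree(s)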
def demo():
s = "[ Pierre/NNP Vinken/NNP ] ,/, [ 61/CD years/NNS ] old/JJ ,/, will/MD join/VB [ the/DT board/NN ] ./."
import nltk
t = nltk.chunk.tagstr2tree(s, chunk_label="NP")
t.pprint()
print()
s = """
These DT B-NP
research NN I-NP
protocols NNS I-NP
offer VBP B-VP
to TO B-PP
the DT B-NP
patient NN I-NP
not RB O
only RB O
the DT B-NP
very RB I-NP
best JJS I-NP
therapy NN I-NP
which WDT B-NP
we PRP B-NP
have VBP B-VP
established VBN I-VP
today NN B-NP
but CC B-NP
also RB I-NP
the DT B-NP
hope NN I-NP
of IN B-PP
something NN B-NP
still RB B-ADJP
better JJR I-ADJP
. . O
"""
conll_tree = conllstr2tree(s, chunk_types=("NP", "PP"))
conll_tree.pprint()
# Demonstrate CoNLL output
print("CoNLL output:")
print(nltk.chunk.tree2conllstr(conll_tree))
print()
if __name__ == "__main__":
demo()