bo-graduation/venv/lib/python3.7/site-packages/nltk/chunk/util.py

# Natural Language Toolkit: Chunk format conversions
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Steven Bird <stevenbird1@gmail.com> (minor additions)
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

import re

from nltk.tree import Tree
from nltk.tag.mapping import map_tag
from nltk.tag.util import str2tuple

##//////////////////////////////////////////////////////
## EVALUATION
##//////////////////////////////////////////////////////

from nltk.metrics import accuracy as _accuracy


def accuracy(chunker, gold):
    """
    Score the accuracy of the chunker against the gold standard.

    Each gold tree is flattened and handed to ``chunker.parse``; the
    gold tree and the re-chunked tree are both converted with
    ``tree2conlltags`` and the two accumulated tag sequences are
    compared with ``nltk.metrics.accuracy``.

    :type chunker: ChunkParserI
    :param chunker: The chunker being evaluated.
    :type gold: tree
    :param gold: The chunk structures to score the chunker on.
    :rtype: float
    """
    reference_tags, predicted_tags = [], []
    for gold_tree in gold:
        rechunked = chunker.parse(gold_tree.flatten())
        reference_tags.extend(tree2conlltags(gold_tree))
        predicted_tags.extend(tree2conlltags(rechunked))

    return _accuracy(reference_tags, predicted_tags)


# Patched for increased performance by Yoav Goldberg <yoavg@cs.bgu.ac.il>, 2006-01-13
#  -- statistics are evaluated only on demand, instead of at every sentence evaluation
#
# SB: use nltk.metrics for precision/recall scoring?
#
class ChunkScore(object):
    """
    A utility class for scoring chunk parsers.  ``ChunkScore`` can
    evaluate a chunk parser's output, based on a number of statistics
    (precision, recall, f-measure, missed chunks, incorrect chunks).
    It can also combine the scores from the parsing of multiple texts;
    this makes it significantly easier to evaluate a chunk parser that
    operates one sentence at a time.

    Texts are evaluated with the ``score`` method.  The results of
    evaluation can be accessed via a number of accessor methods, such
    as ``precision`` and ``f_measure``.  A typical use of the
    ``ChunkScore`` class is::

        >>> chunkscore = ChunkScore()           # doctest: +SKIP
        >>> for correct in correct_sentences:   # doctest: +SKIP
        ...     guess = chunkparser.parse(correct.leaves())   # doctest: +SKIP
        ...     chunkscore.score(correct, guess)              # doctest: +SKIP
        >>> print('F Measure:', chunkscore.f_measure())       # doctest: +SKIP
        F Measure: 0.823

    :ivar kwargs: Keyword arguments:

        - max_tp_examples, max_fp_examples, max_fn_examples: Intended
          caps on the number of true-positive, false-positive and
          false-negative examples to record (each defaults to 100).
          NOTE: the values are stored on the instance, but no method of
          this class reads them, so the accessor methods return every
          chunk and the numerical metrics are unaffected.

        - chunk_label: A regular expression indicating which chunks
          should be compared.  Defaults to ``'.*'`` (i.e., all chunks).

    :type _correct: set
    :ivar _correct: All ``((sentence, position), chunk)`` entries taken
        from the gold-standard trees.
    :type _guessed: set
    :ivar _guessed: All ``((sentence, position), chunk)`` entries taken
        from the guessed trees.

    :type _tp: set
    :ivar _tp: Set of true positives
    :type _fp: set
    :ivar _fp: Set of false positives
    :type _fn: set
    :ivar _fn: Set of false negatives

    :type _tp_num: int
    :ivar _tp_num: Number of true positives
    :type _fp_num: int
    :ivar _fp_num: Number of false positives
    :type _fn_num: int
    :ivar _fn_num: Number of false negatives.
    """

    def __init__(self, **kwargs):
        self._correct = set()
        self._guessed = set()
        self._tp = set()
        self._fp = set()
        self._fn = set()
        # Stored for API compatibility; not consulted anywhere in this class.
        self._max_tp = kwargs.get("max_tp_examples", 100)
        self._max_fp = kwargs.get("max_fp_examples", 100)
        self._max_fn = kwargs.get("max_fn_examples", 100)
        self._chunk_label = kwargs.get("chunk_label", ".*")
        self._tp_num = 0
        self._fp_num = 0
        self._fn_num = 0
        # Number of sentences scored so far; used as the sentence id that
        # keeps chunks from different sentences distinct in the sets.
        self._count = 0
        self._tags_correct = 0.0
        self._tags_total = 0.0

        # The tp/fp/fn sets are recomputed lazily, only when a metric is
        # requested after new sentences have been scored.
        self._measuresNeedUpdate = False

    def _updateMeasures(self):
        """Recompute the tp/fp/fn sets and counts if new data was scored."""
        if self._measuresNeedUpdate:
            self._tp = self._guessed & self._correct
            self._fn = self._correct - self._guessed
            self._fp = self._guessed - self._correct
            self._tp_num = len(self._tp)
            self._fp_num = len(self._fp)
            self._fn_num = len(self._fn)
            self._measuresNeedUpdate = False

    @staticmethod
    def _chunks_in_input_order(entries):
        """
        Return the chunks of ``entries`` ordered by their position key.

        Each entry is a ``((sentence, position), chunk)`` pair.  Sorting
        uses the key only, so the chunks themselves never need to be
        comparable; the key is then discarded.
        """
        ordered = sorted(entries, key=lambda entry: entry[0])
        return [chunk for _, chunk in ordered]

    def score(self, correct, guessed):
        """
        Given a correctly chunked sentence, score another chunked
        version of the same sentence.

        :type correct: chunk structure
        :param correct: The known-correct ("gold standard") chunked
            sentence.
        :type guessed: chunk structure
        :param guessed: The chunked sentence to be scored.
        """
        self._correct |= _chunksets(correct, self._count, self._chunk_label)
        self._guessed |= _chunksets(guessed, self._count, self._chunk_label)
        self._count += 1
        self._measuresNeedUpdate = True
        # Keep track of per-tag accuracy (if possible)
        try:
            correct_tags = tree2conlltags(correct)
            guessed_tags = tree2conlltags(guessed)
        except ValueError:
            # This exception case is for nested chunk structures,
            # where tree2conlltags will fail with a ValueError: "Tree
            # is too deeply nested to be printed in CoNLL format."
            correct_tags = guessed_tags = ()
        self._tags_total += len(correct_tags)
        # zip() stops at the shorter sequence, so surplus tags on either
        # side are simply not counted as correct.
        self._tags_correct += sum(
            1 for (t, g) in zip(guessed_tags, correct_tags) if t == g
        )

    def accuracy(self):
        """
        Return the overall tag-based accuracy for all text that have
        been scored by this ``ChunkScore``, using the IOB (conll2000)
        tag encoding.  Returns 1 when no tags have been counted.

        :rtype: float
        """
        if self._tags_total == 0:
            return 1
        return self._tags_correct / self._tags_total

    def precision(self):
        """
        Return the overall precision for all texts that have been
        scored by this ``ChunkScore``.  Returns 0 when nothing was
        guessed.

        :rtype: float
        """
        self._updateMeasures()
        div = self._tp_num + self._fp_num
        if div == 0:
            return 0
        return self._tp_num / div

    def recall(self):
        """
        Return the overall recall for all texts that have been
        scored by this ``ChunkScore``.  Returns 0 when there are no
        correct chunks.

        :rtype: float
        """
        self._updateMeasures()
        div = self._tp_num + self._fn_num
        if div == 0:
            return 0
        return self._tp_num / div

    def f_measure(self, alpha=0.5):
        """
        Return the overall F measure for all texts that have been
        scored by this ``ChunkScore``.

        :param alpha: the relative weighting of precision and recall.
            Larger alpha biases the score towards the precision value,
            while smaller alpha biases the score towards the recall
            value.  ``alpha`` should have a value in the range [0,1].
        :type alpha: float
        :rtype: float
        """
        # precision() and recall() refresh the cached measures themselves.
        p = self.precision()
        r = self.recall()
        if p == 0 or r == 0:
            return 0
        # With p and r both non-zero the denominator is positive for any
        # alpha in [0, 1]; it reduces to r at alpha=0 and p at alpha=1.
        return 1 / (alpha / p + (1 - alpha) / r)

    def missed(self):
        """
        Return the chunks which were included in the
        correct chunk structures, but not in the guessed chunk
        structures, listed in input order.

        :rtype: list of chunks
        """
        self._updateMeasures()
        return self._chunks_in_input_order(self._fn)

    def incorrect(self):
        """
        Return the chunks which were included in the guessed chunk structures,
        but not in the correct chunk structures, listed in input order.

        :rtype: list of chunks
        """
        self._updateMeasures()
        return self._chunks_in_input_order(self._fp)

    def correct(self):
        """
        Return the chunks which were included in the correct
        chunk structures, listed in input order.

        :rtype: list of chunks
        """
        return self._chunks_in_input_order(self._correct)

    def guessed(self):
        """
        Return the chunks which were included in the guessed
        chunk structures, listed in input order.

        :rtype: list of chunks
        """
        return self._chunks_in_input_order(self._guessed)

    def __len__(self):
        """Return the number of distinct correct (gold) chunks scored."""
        self._updateMeasures()
        return self._tp_num + self._fn_num

    def __repr__(self):
        """
        Return a concise representation of this ``ChunkScoring``.

        :rtype: str
        """
        return "<ChunkScoring of " + repr(len(self)) + " chunks>"

    def __str__(self):
        """
        Return a verbose representation of this ``ChunkScoring``.
        This representation includes the precision, recall, and
        f-measure scores.  For other information about the score,
        use the accessor methods (e.g., ``missed()`` and ``incorrect()``).

        :rtype: str
        """
        # "%" needs no escaping in str.format templates; writing "%%"
        # would print two percent signs.
        return (
            "ChunkParse score:\n"
            + ("    IOB Accuracy: {:5.1f}%\n".format(self.accuracy() * 100))
            + ("    Precision:    {:5.1f}%\n".format(self.precision() * 100))
            + ("    Recall:       {:5.1f}%\n".format(self.recall() * 100))
            + ("    F-Measure:    {:5.1f}%".format(self.f_measure() * 100))
        )


# extract chunks, and assign unique id, the absolute position of
# the first word of the chunk
def _chunksets(t, count, chunk_label):
    """
    Collect the chunks of ``t`` as a set of ``((count, pos), chunk)`` pairs.

    ``pos`` is the index of the chunk's first leaf within ``t`` and
    ``count`` is passed through unchanged, so the pair identifies the
    chunk's location.  Only subtrees whose label matches the regular
    expression ``chunk_label`` (via ``re.match``) are collected; each is
    frozen so it can be stored in a set.
    """
    collected = set()
    offset = 0
    for node in t:
        if not isinstance(node, Tree):
            # A bare token occupies exactly one leaf position.
            offset += 1
            continue
        if re.match(chunk_label, node.label()):
            collected.add(((count, offset), node.freeze()))
        # Skipped subtrees still advance the position by their leaves.
        offset += len(node.leaves())
    return collected


def tagstr2tree(
    s, chunk_label="NP", root_label="S", sep="/", source_tagset=None, target_tagset=None
):
    """
    Divide a string of bracketted tagged text into
    chunks and unchunked tokens, and produce a Tree.
    Chunks are marked by square brackets (``[...]``) and may not be
    nested.  Words are delimited by whitespace and are split into
    ``(word, tag)`` pairs with ``str2tuple`` using ``sep``.

    :param s: The string to be converted
    :type s: str
    :param chunk_label: The label to use for chunk nodes
    :type chunk_label: str
    :param root_label: The label to use for the root of the tree
    :type root_label: str
    :param sep: The word/tag separator.  If None, each word is added to
        the tree as a plain string without being split.
    :param source_tagset: If both this and ``target_tagset`` are truthy,
        every tag is passed through ``map_tag`` from this tagset.
    :param target_tagset: The tagset to map tags to (see ``source_tagset``).
    :rtype: Tree
    :raise ValueError: if a ``[`` appears inside a chunk, a ``]`` appears
        outside a chunk, or a chunk is still open at the end of ``s``.
    """

    token_re = re.compile(r"\[|\]|[^\[\]\s]+")

    # open_nodes holds the root and, while inside brackets, the open chunk.
    open_nodes = [Tree(root_label, [])]
    for found in token_re.finditer(s):
        token = found.group()

        if token == "[":
            if len(open_nodes) != 1:
                raise ValueError("Unexpected [ at char {:d}".format(found.start()))
            new_chunk = Tree(chunk_label, [])
            open_nodes[-1].append(new_chunk)
            open_nodes.append(new_chunk)
            continue

        if token == "]":
            if len(open_nodes) != 2:
                raise ValueError("Unexpected ] at char {:d}".format(found.start()))
            open_nodes.pop()
            continue

        if sep is None:
            leaf = token
        else:
            word, tag = str2tuple(token, sep)
            if source_tagset and target_tagset:
                tag = map_tag(source_tagset, target_tagset, tag)
            leaf = (word, tag)
        open_nodes[-1].append(leaf)

    if len(open_nodes) != 1:
        raise ValueError("Expected ] at char {:d}".format(len(s)))
    return open_nodes[0]


### CONLL

# One CoNLL line: word, POS tag, then an I/O/B state optionally followed
# by "-" and a chunk type.  Written as a raw string so that "\S" and "\s"
# reach the regex engine instead of being invalid string escapes.
_LINE_RE = re.compile(r"(\S+)\s+(\S+)\s+([IOB])-?(\S+)?")


def conllstr2tree(s, chunk_types=("NP", "PP", "VP"), root_label="S"):
    """
    Return a chunk structure for a single sentence
    encoded in the given CONLL 2000 style string.

    Every non-blank line is matched against ``_LINE_RE`` to obtain a
    word, a tag, an IOB state and an optional chunk type.  The result is
    a tree rooted at a node labeled ``root_label`` whose children are
    ``(word, tag)`` tuples and one-level chunk subtrees.

    :param s: The CoNLL string to be converted.
    :type s: str
    :param chunk_types: The chunk types to be converted; tokens of any
        other type are treated as outside a chunk.  If None, all chunk
        types are kept.
    :type chunk_types: tuple
    :param root_label: The node label to use for the root.
    :type root_label: str
    :rtype: Tree
    :raise ValueError: if a non-blank line does not match ``_LINE_RE``;
        the message carries the zero-based index of that line.
    """

    # open_nodes is [root] outside a chunk and [root, chunk] inside one.
    open_nodes = [Tree(root_label, [])]

    for index, raw_line in enumerate(s.split("\n")):
        if raw_line.strip() == "":
            continue

        parsed = _LINE_RE.match(raw_line)
        if parsed is None:
            raise ValueError("Error on line {:d}".format(index))
        word, tag, state, chunk_type = parsed.groups()

        # Chunk types we were not asked for count as "Outside".
        if chunk_types is not None and chunk_type not in chunk_types:
            state = "O"

        # An "Inside" token only continues the current node when its
        # type equals that node's label; anything else ends the chunk.
        continues_current = state == "I" and chunk_type == open_nodes[-1].label()
        if not continues_current:
            if len(open_nodes) == 2:
                open_nodes.pop()
            # "Begin", or an "Inside" that matched nothing, opens a chunk.
            if state != "O":
                new_chunk = Tree(chunk_type, [])
                open_nodes[-1].append(new_chunk)
                open_nodes.append(new_chunk)

        open_nodes[-1].append((word, tag))

    return open_nodes[0]


def tree2conlltags(t):
    """
    Return a list of 3-tuples containing ``(word, tag, IOB-tag)``.
    Convert a tree to the CoNLL IOB tag format.

    A child that has a ``label()`` method is treated as a chunk: its
    first token is tagged ``B-<label>`` and the rest ``I-<label>``.  A
    child for which that lookup raises ``AttributeError`` is treated as
    a plain ``(word, tag)`` token and tagged ``O``.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: list(tuple)
    :raise ValueError: if a chunk itself contains a ``Tree``.
    """

    rows = []
    for node in t:
        try:
            label = node.label()
            for index, leaf in enumerate(node):
                if isinstance(leaf, Tree):
                    raise ValueError(
                        "Tree is too deeply nested to be printed in CoNLL format"
                    )
                rows.append(
                    (leaf[0], leaf[1], ("B-" if index == 0 else "I-") + label)
                )
        except AttributeError:
            rows.append((node[0], node[1], "O"))
    return rows


def conlltags2tree(
    sentence, chunk_types=("NP", "PP", "VP"), root_label="S", strict=False
):
    """
    Convert the CoNLL IOB format to a tree.

    :param sentence: An iterable of ``(word, postag, chunktag)`` triples,
        where ``chunktag`` is ``"B-<type>"``, ``"I-<type>"``, ``"O"`` or
        None.
    :param chunk_types: Accepted for interface compatibility; the body
        does not reference it, so chunks of every type are kept.
    :param root_label: The node label to use for the root.
    :param strict: If true, raise ``ValueError`` for a None chunk tag or
        for an ``I-`` tag that does not continue the preceding chunk.
        Otherwise None is treated as ``O`` and such an ``I-`` tag starts
        a new chunk.
    :rtype: Tree
    :raise ValueError: for any other chunk tag, regardless of ``strict``.
    """
    tree = Tree(root_label, [])
    for word, postag, chunktag in sentence:
        token = (word, postag)

        if chunktag is None:
            if strict:
                raise ValueError("Bad conll tag sequence")
            # Treat as O
            tree.append(token)
            continue

        if chunktag.startswith("B-"):
            tree.append(Tree(chunktag[2:], [token]))
            continue

        if chunktag.startswith("I-"):
            label = chunktag[2:]
            continues_previous = (
                len(tree) != 0
                and isinstance(tree[-1], Tree)
                and tree[-1].label() == label
            )
            if continues_previous:
                tree[-1].append(token)
            elif strict:
                raise ValueError("Bad conll tag sequence")
            else:
                # Treat as B-*
                tree.append(Tree(label, [token]))
            continue

        if chunktag != "O":
            raise ValueError("Bad conll tag {0!r}".format(chunktag))
        tree.append(token)
    return tree


def tree2conllstr(t):
    """
    Return a multiline string where each line contains a word, tag and IOB tag.
    Convert a tree to the CoNLL IOB string format

    The three fields produced by ``tree2conlltags`` are joined with a
    single space and the lines with a newline (no trailing newline).

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: str
    """
    rows = tree2conlltags(t)
    return "\n".join([" ".join(fields) for fields in rows])


### IEER

# Matches one <DOC>...</DOC> document.  The named groups docno, doctype,
# date_time and headline belong to optional elements and are None when
# the element is absent; the text group holds the raw <TEXT> contents.
_IEER_DOC_RE = re.compile(
    r"<DOC>\s*"
    # Optional header fields, captured without surrounding whitespace.
    r"(<DOCNO>\s*(?P<docno>.+?)\s*</DOCNO>\s*)?"
    r"(<DOCTYPE>\s*(?P<doctype>.+?)\s*</DOCTYPE>\s*)?"
    r"(<DATE_TIME>\s*(?P<date_time>.+?)\s*</DATE_TIME>\s*)?"
    r"<BODY>\s*"
    r"(<HEADLINE>\s*(?P<headline>.+?)\s*</HEADLINE>\s*)?"
    r"<TEXT>(?P<text>.*?)</TEXT>\s*"
    r"</BODY>\s*</DOC>\s*",
    # DOTALL lets "." cross newlines, so captured fields may span lines.
    re.DOTALL,
)

# Extracts the value of the type="..." attribute from a "<b_...>" begin
# tag.  Written as a raw string so that "\w" and "\s" reach the regex
# engine instead of being invalid string escapes.
_IEER_TYPE_RE = re.compile(r'<b_\w+\s+[^>]*?type="(?P<type>\w+)"')


def _ieer_read_text(s, root_label):
    """
    Build a chunk tree from IEER-annotated text.

    The text is split into markup tags and whitespace-delimited words.
    A ``<b_...>`` tag opens a chunk labeled with the tag's ``type``
    attribute, an ``<e_...>`` tag closes the innermost open chunk, and
    every other piece (words, and any other tag) is added as a leaf of
    the innermost open node.

    :param s: The annotated text, or None.
    :param root_label: The node label to use for the root.
    :return: The tree rooted at ``root_label``, or an empty list when
        ``s`` is None.
    :raise ValueError: if a begin tag has no ``type`` attribute, an end
        tag has no chunk to close, or a chunk is left open at the end.
    """
    # s will be None if there is no headline in the text
    # return the empty list in place of a Tree
    if s is None:
        return []

    stack = [Tree(root_label, [])]
    for piece_m in re.finditer(r"<[^>]+>|[^\s<]+", s):
        piece = piece_m.group()
        try:
            if piece.startswith("<b_"):
                m = _IEER_TYPE_RE.match(piece)
                if m is None:
                    # Report a malformed begin tag as a positioned
                    # ValueError (via the handler below) instead of
                    # failing on m.group with an AttributeError.
                    raise ValueError("begin tag without a type attribute")
                chunk = Tree(m.group("type"), [])
                stack[-1].append(chunk)
                stack.append(chunk)
            elif piece.startswith("<e_"):
                stack.pop()
            else:
                # Unrecognised tags are deliberately kept as leaves.
                stack[-1].append(piece)
        except (IndexError, ValueError) as err:
            raise ValueError(
                "Bad IEER string (error at character {:d})".format(piece_m.start())
            ) from err
    if len(stack) != 1:
        raise ValueError("Bad IEER string")
    return stack[0]


def ieerstr2tree(
    s,
    chunk_types=[
        "LOCATION",
        "ORGANIZATION",
        "PERSON",
        "DURATION",
        "DATE",
        "CARDINAL",
        "PERCENT",
        "MONEY",
        "MEASURE",
    ],
    root_label="S",
):
    """
    Convert a string of chunked text in the IEER named entity format
    into a chunk structure.

    If ``s`` matches ``_IEER_DOC_RE`` as a single document, the result
    is a dict with the keys ``text``, ``docno``, ``doctype``,
    ``date_time`` and ``headline``; ``text`` and ``headline`` are parsed
    with ``_ieer_read_text`` so that entities in the headline are
    captured too.  Otherwise the whole string is treated as if it were
    the contents of ``<TEXT>...</TEXT>`` and parsed directly.

    :param s: The IEER string to be converted.
    :param chunk_types: Accepted for interface compatibility; the body
        does not reference it.
    :param root_label: The node label to use for the root of each tree.
    :rtype: dict or Tree
    """

    document = _IEER_DOC_RE.match(s)
    if not document:
        return _ieer_read_text(s, root_label)

    # Parse the body before the headline, as the fields are listed below.
    body_tree = _ieer_read_text(document.group("text"), root_label)
    headline_tree = _ieer_read_text(document.group("headline"), root_label)
    return {
        "text": body_tree,
        "docno": document.group("docno"),
        "doctype": document.group("doctype"),
        "date_time": document.group("date_time"),
        "headline": headline_tree,
    }


def demo():
    """
    Print a short demonstration of the chunk format conversions.

    A bracketed tagged sentence is parsed with ``tagstr2tree`` and
    pretty-printed; a CoNLL-format sentence is then parsed with
    ``conllstr2tree`` (keeping only NP and PP chunks), pretty-printed,
    and written back out with ``tree2conllstr``.
    """
    import nltk

    bracketed = "[ Pierre/NNP Vinken/NNP ] ,/, [ 61/CD years/NNS ] old/JJ ,/, will/MD join/VB [ the/DT board/NN ] ./."
    bracket_tree = nltk.chunk.tagstr2tree(bracketed, chunk_label="NP")
    bracket_tree.pprint()
    print()

    conll_text = """
These DT B-NP
research NN I-NP
protocols NNS I-NP
offer VBP B-VP
to TO B-PP
the DT B-NP
patient NN I-NP
not RB O
only RB O
the DT B-NP
very RB I-NP
best JJS I-NP
therapy NN I-NP
which WDT B-NP
we PRP B-NP
have VBP B-VP
established VBN I-VP
today NN B-NP
but CC B-NP
also RB I-NP
the DT B-NP
hope NN I-NP
of IN B-PP
something NN B-NP
still RB B-ADJP
better JJR I-ADJP
. . O
"""

    conll_tree = conllstr2tree(conll_text, chunk_types=("NP", "PP"))
    conll_tree.pprint()

    # Round-trip the tree back to CoNLL text.
    print("CoNLL output:")
    print(nltk.chunk.tree2conllstr(conll_tree))
    print()


# Run the demonstration when this module is executed as a script.
if __name__ == "__main__":
    demo()
readme check 5 years ago			`# Natural Language Toolkit: Chunk format conversions`
			`#`
add tag_comparison_v3.py 5 years ago			`# Copyright (C) 2001-2020 NLTK Project`
readme check 5 years ago			`# Author: Edward Loper <edloper@gmail.com>`
			`# Steven Bird <stevenbird1@gmail.com> (minor additions)`
			`# URL: <http://nltk.org/>`
			`# For license information, see LICENSE.TXT`

			`import re`

			`from nltk.tree import Tree`
			`from nltk.tag.mapping import map_tag`
			`from nltk.tag.util import str2tuple`

			`##//////////////////////////////////////////////////////`
			`## EVALUATION`
			`##//////////////////////////////////////////////////////`

			`from nltk.metrics import accuracy as _accuracy`


			`def accuracy(chunker, gold):`
			`"""`
			`Score the accuracy of the chunker against the gold standard.`
			`Strip the chunk information from the gold standard and rechunk it using`
			`the chunker, then compute the accuracy score.`

			`:type chunker: ChunkParserI`
			`:param chunker: The chunker being evaluated.`
			`:type gold: tree`
			`:param gold: The chunk structures to score the chunker on.`
			`:rtype: float`
			`"""`

			`gold_tags = []`
			`test_tags = []`
			`for gold_tree in gold:`
			`test_tree = chunker.parse(gold_tree.flatten())`
			`gold_tags += tree2conlltags(gold_tree)`
			`test_tags += tree2conlltags(test_tree)`

			`# print 'GOLD:', gold_tags[:50]`
			`# print 'TEST:', test_tags[:50]`
			`return _accuracy(gold_tags, test_tags)`


			`# Patched for increased performance by Yoav Goldberg <yoavg@cs.bgu.ac.il>, 2006-01-13`
			`# -- statistics are evaluated only on demand, instead of at every sentence evaluation`
			`#`
			`# SB: use nltk.metrics for precision/recall scoring?`
			`#`
			`class ChunkScore(object):`
			`"""`
			A utility class for scoring chunk parsers. ``ChunkScore`` can
			`evaluate a chunk parser's output, based on a number of statistics`
			`(precision, recall, f-measure, misssed chunks, incorrect chunks).`
			`It can also combine the scores from the parsing of multiple texts;`
			`this makes it significantly easier to evaluate a chunk parser that`
			`operates one sentence at a time.`

			Texts are evaluated with the ``score`` method. The results of
			`evaluation can be accessed via a number of accessor methods, such`
			as ``precision`` and ``f_measure``. A typical use of the
			``ChunkScore`` class is::

			`>>> chunkscore = ChunkScore() # doctest: +SKIP`
			`>>> for correct in correct_sentences: # doctest: +SKIP`
			`... guess = chunkparser.parse(correct.leaves()) # doctest: +SKIP`
			`... chunkscore.score(correct, guess) # doctest: +SKIP`
			`>>> print('F Measure:', chunkscore.f_measure()) # doctest: +SKIP`
			`F Measure: 0.823`

			`:ivar kwargs: Keyword arguments:`

			`- max_tp_examples: The maximum number actual examples of true`
			positives to record. This affects the ``correct`` member
			function: ``correct`` will not return more than this number
			`of true positive examples. This does not affect any of`
			`the numerical metrics (precision, recall, or f-measure)`

			`- max_fp_examples: The maximum number actual examples of false`
			positives to record. This affects the ``incorrect`` member
			function and the ``guessed`` member function: ``incorrect``
			`will not return more than this number of examples, and`
			``guessed`` will not return more than this number of true
			`positive examples. This does not affect any of the`
			`numerical metrics (precision, recall, or f-measure)`

			`- max_fn_examples: The maximum number actual examples of false`
			negatives to record. This affects the ``missed`` member
			function and the ``correct`` member function: ``missed``
			`will not return more than this number of examples, and`
			``correct`` will not return more than this number of true
			`negative examples. This does not affect any of the`
			`numerical metrics (precision, recall, or f-measure)`

			`- chunk_label: A regular expression indicating which chunks`
			should be compared. Defaults to ``'.*'`` (i.e., all chunks).

			`:type _tp: list(Token)`
			`:ivar _tp: List of true positives`
			`:type _fp: list(Token)`
			`:ivar _fp: List of false positives`
			`:type _fn: list(Token)`
			`:ivar _fn: List of false negatives`

			`:type _tp_num: int`
			`:ivar _tp_num: Number of true positives`
			`:type _fp_num: int`
			`:ivar _fp_num: Number of false positives`
			`:type _fn_num: int`
			`:ivar _fn_num: Number of false negatives.`
			`"""`

			`def __init__(self, **kwargs):`
			`self._correct = set()`
			`self._guessed = set()`
			`self._tp = set()`
			`self._fp = set()`
			`self._fn = set()`
add tag_comparison_v3.py 5 years ago			`self._max_tp = kwargs.get("max_tp_examples", 100)`
			`self._max_fp = kwargs.get("max_fp_examples", 100)`
			`self._max_fn = kwargs.get("max_fn_examples", 100)`
			`self._chunk_label = kwargs.get("chunk_label", ".*")`
readme check 5 years ago			`self._tp_num = 0`
			`self._fp_num = 0`
			`self._fn_num = 0`
			`self._count = 0`
			`self._tags_correct = 0.0`
			`self._tags_total = 0.0`

			`self._measuresNeedUpdate = False`

			`def _updateMeasures(self):`
			`if self._measuresNeedUpdate:`
			`self._tp = self._guessed & self._correct`
			`self._fn = self._correct - self._guessed`
			`self._fp = self._guessed - self._correct`
			`self._tp_num = len(self._tp)`
			`self._fp_num = len(self._fp)`
			`self._fn_num = len(self._fn)`
			`self._measuresNeedUpdate = False`

			`def score(self, correct, guessed):`
			`"""`
			`Given a correctly chunked sentence, score another chunked`
			`version of the same sentence.`

			`:type correct: chunk structure`
			`:param correct: The known-correct ("gold standard") chunked`
			`sentence.`
			`:type guessed: chunk structure`
			`:param guessed: The chunked sentence to be scored.`
			`"""`
			`self._correct \|= _chunksets(correct, self._count, self._chunk_label)`
			`self._guessed \|= _chunksets(guessed, self._count, self._chunk_label)`
			`self._count += 1`
			`self._measuresNeedUpdate = True`
			`# Keep track of per-tag accuracy (if possible)`
			`try:`
			`correct_tags = tree2conlltags(correct)`
			`guessed_tags = tree2conlltags(guessed)`
			`except ValueError:`
			`# This exception case is for nested chunk structures,`
			`# where tree2conlltags will fail with a ValueError: "Tree`
			`# is too deeply nested to be printed in CoNLL format."`
			`correct_tags = guessed_tags = ()`
			`self._tags_total += len(correct_tags)`
			`self._tags_correct += sum(`
			`1 for (t, g) in zip(guessed_tags, correct_tags) if t == g`
			`)`

			`def accuracy(self):`
			`"""`
			`Return the overall tag-based accuracy for all text that have`
			been scored by this ``ChunkScore``, using the IOB (conll2000)
			`tag encoding.`

			`:rtype: float`
			`"""`
			`if self._tags_total == 0:`
			`return 1`
			`return self._tags_correct / self._tags_total`

			`def precision(self):`
			`"""`
			`Return the overall precision for all texts that have been`
			scored by this ``ChunkScore``.

			`:rtype: float`
			`"""`
			`self._updateMeasures()`
			`div = self._tp_num + self._fp_num`
			`if div == 0:`
			`return 0`
			`else:`
			`return self._tp_num / div`

			`def recall(self):`
			`"""`
			`Return the overall recall for all texts that have been`
			scored by this ``ChunkScore``.

			`:rtype: float`
			`"""`
			`self._updateMeasures()`
			`div = self._tp_num + self._fn_num`
			`if div == 0:`
			`return 0`
			`else:`
			`return self._tp_num / div`

			`def f_measure(self, alpha=0.5):`
			`"""`
			`Return the overall F measure for all texts that have been`
			scored by this ``ChunkScore``.

			`:param alpha: the relative weighting of precision and recall.`
			`Larger alpha biases the score towards the precision value,`
			`while smaller alpha biases the score towards the recall`
			value. ``alpha`` should have a value in the range [0,1].
			`:type alpha: float`
			`:rtype: float`
			`"""`
			`self._updateMeasures()`
			`p = self.precision()`
			`r = self.recall()`
			`if p == 0 or r == 0: # what if alpha is 0 or 1?`
			`return 0`
			`return 1 / (alpha / p + (1 - alpha) / r)`

			`def missed(self):`
			`"""`
			`Return the chunks which were included in the`
			`correct chunk structures, but not in the guessed chunk`
			`structures, listed in input order.`

			`:rtype: list of chunks`
			`"""`
			`self._updateMeasures()`
			`chunks = list(self._fn)`
			`return [c[1] for c in chunks] # discard position information`

			`def incorrect(self):`
			`"""`
			`Return the chunks which were included in the guessed chunk structures,`
			`but not in the correct chunk structures, listed in input order.`

			`:rtype: list of chunks`
			`"""`
			`self._updateMeasures()`
			`chunks = list(self._fp)`
			`return [c[1] for c in chunks] # discard position information`

			`def correct(self):`
			`"""`
			`Return the chunks which were included in the correct`
			`chunk structures, listed in input order.`

			`:rtype: list of chunks`
			`"""`
			`chunks = list(self._correct)`
			`return [c[1] for c in chunks] # discard position information`

			`def guessed(self):`
			`"""`
			`Return the chunks which were included in the guessed`
			`chunk structures, listed in input order.`

			`:rtype: list of chunks`
			`"""`
			`chunks = list(self._guessed)`
			`return [c[1] for c in chunks] # discard position information`

			`def __len__(self):`
			`self._updateMeasures()`
			`return self._tp_num + self._fn_num`

			`def __repr__(self):`
			`"""`
			Return a concise representation of this ``ChunkScoring``.

			`:rtype: str`
			`"""`
add tag_comparison_v3.py 5 years ago			`return "<ChunkScoring of " + repr(len(self)) + " chunks>"`
readme check 5 years ago
			`def __str__(self):`
			`"""`
			Return a verbose representation of this ``ChunkScoring``.
			`This representation includes the precision, recall, and`
			`f-measure scores. For other information about the score,`
			use the accessor methods (e.g., ``missed()`` and ``incorrect()``).

			`:rtype: str`
			`"""`
			`return (`
			`"ChunkParse score:\n"`
			`+ (" IOB Accuracy: {:5.1f}%%\n".format(self.accuracy() * 100))`
			`+ (" Precision: {:5.1f}%%\n".format(self.precision() * 100))`
			`+ (" Recall: {:5.1f}%%\n".format(self.recall() * 100))`
			`+ (" F-Measure: {:5.1f}%%".format(self.f_measure() * 100))`
			`)`


			`# extract chunks, and assign unique id, the absolute position of`
			`# the first word of the chunk`
			`def _chunksets(t, count, chunk_label):`
			`pos = 0`
			`chunks = []`
			`for child in t:`
			`if isinstance(child, Tree):`
			`if re.match(chunk_label, child.label()):`
			`chunks.append(((count, pos), child.freeze()))`
			`pos += len(child.leaves())`
			`else:`
			`pos += 1`
			`return set(chunks)`


			`def tagstr2tree(`
add tag_comparison_v3.py 5 years ago			`s, chunk_label="NP", root_label="S", sep="/", source_tagset=None, target_tagset=None`
readme check 5 years ago			`):`
			`"""`
			`Divide a string of bracketted tagged text into`
			`chunks and unchunked tokens, and produce a Tree.`
			Chunks are marked by square brackets (``[...]``). Words are
			`delimited by whitespace, and each word should have the form`
			``text/tag``. Words that do not contain a slash are
			assigned a ``tag`` of None.

			`:param s: The string to be converted`
			`:type s: str`
			`:param chunk_label: The label to use for chunk nodes`
			`:type chunk_label: str`
			`:param root_label: The label to use for the root of the tree`
			`:type root_label: str`
			`:rtype: Tree`
			`"""`

add tag_comparison_v3.py 5 years ago			`WORD_OR_BRACKET = re.compile(r"\[\|\]\|[^\[\]\s]+")`
readme check 5 years ago
			`stack = [Tree(root_label, [])]`
			`for match in WORD_OR_BRACKET.finditer(s):`
			`text = match.group()`
add tag_comparison_v3.py 5 years ago			`if text[0] == "[":`
readme check 5 years ago			`if len(stack) != 1:`
add tag_comparison_v3.py 5 years ago			`raise ValueError("Unexpected [ at char {:d}".format(match.start()))`
readme check 5 years ago			`chunk = Tree(chunk_label, [])`
			`stack[-1].append(chunk)`
			`stack.append(chunk)`
add tag_comparison_v3.py 5 years ago			`elif text[0] == "]":`
readme check 5 years ago			`if len(stack) != 2:`
add tag_comparison_v3.py 5 years ago			`raise ValueError("Unexpected ] at char {:d}".format(match.start()))`
readme check 5 years ago			`stack.pop()`
			`else:`
			`if sep is None:`
			`stack[-1].append(text)`
			`else:`
			`word, tag = str2tuple(text, sep)`
			`if source_tagset and target_tagset:`
			`tag = map_tag(source_tagset, target_tagset, tag)`
			`stack[-1].append((word, tag))`

			`if len(stack) != 1:`
add tag_comparison_v3.py 5 years ago			`raise ValueError("Expected ] at char {:d}".format(len(s)))`
readme check 5 years ago			`return stack[0]`


			`### CONLL`

# One CoNLL line: a word, its POS tag, and an IOB marker optionally followed
# by "-" and the chunk type (e.g. "B-NP", "I-VP", or a bare "O").
# Raw string so that "\S" and "\s" reach the regex engine without relying on
# Python passing unknown string escapes through unchanged.
_LINE_RE = re.compile(r"(\S+)\s+(\S+)\s+([IOB])-?(\S+)?")


def conllstr2tree(s, chunk_types=("NP", "PP", "VP"), root_label="S"):
    """
    Return a chunk structure for a single sentence
    encoded in the given CONLL 2000 style string.
    This function converts a CoNLL IOB string into a tree.
    It uses the specified chunk types
    (defaults to NP, PP and VP), and creates a tree rooted at a node
    labeled S (by default).

    :param s: The CoNLL string to be converted.
    :type s: str
    :param chunk_types: The chunk types to be converted.  Tokens tagged
        with any other chunk type are treated as outside ("O").  If None,
        every chunk type is converted.
    :type chunk_types: tuple
    :param root_label: The node label to use for the root.
    :type root_label: str
    :rtype: Tree
    :raise ValueError: If a non-blank line cannot be parsed; the message
        carries the zero-based index of that line.
    """

    # stack[0] is the root; stack[1], when present, is the open chunk.
    stack = [Tree(root_label, [])]

    for lineno, line in enumerate(s.split("\n")):
        if not line.strip():
            continue

        # Decode the line.
        match = _LINE_RE.match(line)
        if match is None:
            raise ValueError("Error on line {:d}".format(lineno))
        (word, tag, state, chunk_type) = match.groups()

        # If it's a chunk type we don't care about, treat it as O.
        if chunk_types is not None and chunk_type not in chunk_types:
            state = "O"

        # An "Inside" token continues the open chunk only if there is one
        # and its type matches.  Checking for an open chunk explicitly
        # (instead of comparing against the root's label) keeps an I-tag
        # from being attached to the root when chunk_type == root_label.
        in_chunk = len(stack) == 2
        mismatch_I = state == "I" and (
            not in_chunk or chunk_type != stack[-1].label()
        )

        # For "Begin"/"Outside", finish any completed chunks -
        # also do so for "Inside" which don't match the previous token.
        if (state in "BO" or mismatch_I) and in_chunk:
            stack.pop()

        # For "Begin" (or a dangling "Inside"), start a new chunk.
        if state == "B" or mismatch_I:
            chunk = Tree(chunk_type, [])
            stack[-1].append(chunk)
            stack.append(chunk)

        # Add the new word token.
        stack[-1].append((word, tag))

    return stack[0]


			`def tree2conlltags(t):`
			`"""`
			Return a list of 3-tuples containing ``(word, tag, IOB-tag)``.
			`Convert a tree to the CoNLL IOB tag format.`

			`:param t: The tree to be converted.`
			`:type t: Tree`
			`:rtype: list(tuple)`
			`"""`

			`tags = []`
			`for child in t:`
			`try:`
			`category = child.label()`
			`prefix = "B-"`
			`for contents in child:`
			`if isinstance(contents, Tree):`
			`raise ValueError(`
			`"Tree is too deeply nested to be printed in CoNLL format"`
			`)`
			`tags.append((contents[0], contents[1], prefix + category))`
			`prefix = "I-"`
			`except AttributeError:`
			`tags.append((child[0], child[1], "O"))`
			`return tags`


			`def conlltags2tree(`
add tag_comparison_v3.py 5 years ago			`sentence, chunk_types=("NP", "PP", "VP"), root_label="S", strict=False`
readme check 5 years ago			`):`
			`"""`
			`Convert the CoNLL IOB format to a tree.`
			`"""`
			`tree = Tree(root_label, [])`
			`for (word, postag, chunktag) in sentence:`
			`if chunktag is None:`
			`if strict:`
			`raise ValueError("Bad conll tag sequence")`
			`else:`
			`# Treat as O`
			`tree.append((word, postag))`
add tag_comparison_v3.py 5 years ago			`elif chunktag.startswith("B-"):`
readme check 5 years ago			`tree.append(Tree(chunktag[2:], [(word, postag)]))`
add tag_comparison_v3.py 5 years ago			`elif chunktag.startswith("I-"):`
readme check 5 years ago			`if (`
			`len(tree) == 0`
			`or not isinstance(tree[-1], Tree)`
			`or tree[-1].label() != chunktag[2:]`
			`):`
			`if strict:`
			`raise ValueError("Bad conll tag sequence")`
			`else:`
			`# Treat as B-*`
			`tree.append(Tree(chunktag[2:], [(word, postag)]))`
			`else:`
			`tree[-1].append((word, postag))`
add tag_comparison_v3.py 5 years ago			`elif chunktag == "O":`
readme check 5 years ago			`tree.append((word, postag))`
			`else:`
			`raise ValueError("Bad conll tag {0!r}".format(chunktag))`
			`return tree`


			`def tree2conllstr(t):`
			`"""`
			`Return a multiline string where each line contains a word, tag and IOB tag.`
			`Convert a tree to the CoNLL IOB string format`

			`:param t: The tree to be converted.`
			`:type t: Tree`
			`:rtype: str`
			`"""`
			`lines = [" ".join(token) for token in tree2conlltags(t)]`
add tag_comparison_v3.py 5 years ago			`return "\n".join(lines)`
readme check 5 years ago

			`### IEER`

			`_IEER_DOC_RE = re.compile(`
add tag_comparison_v3.py 5 years ago			`r"<DOC>\s*"`
			`r"(<DOCNO>\s(?P<docno>.+?)\s</DOCNO>\s*)?"`
			`r"(<DOCTYPE>\s(?P<doctype>.+?)\s</DOCTYPE>\s*)?"`
			`r"(<DATE_TIME>\s(?P<date_time>.+?)\s</DATE_TIME>\s*)?"`
			`r"<BODY>\s*"`
			`r"(<HEADLINE>\s(?P<headline>.+?)\s</HEADLINE>\s*)?"`
			`r"<TEXT>(?P<text>.?)</TEXT>\s"`
			`r"</BODY>\s</DOC>\s",`
readme check 5 years ago			`re.DOTALL,`
			`)`

			`_IEER_TYPE_RE = re.compile('<b_\w+\s+[^>]*?type="(?P<type>\w+)"')`


			`def _ieer_read_text(s, root_label):`
			`stack = [Tree(root_label, [])]`
			`# s will be None if there is no headline in the text`
			`# return the empty list in place of a Tree`
			`if s is None:`
			`return []`
add tag_comparison_v3.py 5 years ago			`for piece_m in re.finditer("<[^>]+>\|[^\s<]+", s):`
readme check 5 years ago			`piece = piece_m.group()`
			`try:`
add tag_comparison_v3.py 5 years ago			`if piece.startswith("<b_"):`
readme check 5 years ago			`m = _IEER_TYPE_RE.match(piece)`
			`if m is None:`
add tag_comparison_v3.py 5 years ago			`print("XXXX", piece)`
			`chunk = Tree(m.group("type"), [])`
readme check 5 years ago			`stack[-1].append(chunk)`
			`stack.append(chunk)`
add tag_comparison_v3.py 5 years ago			`elif piece.startswith("<e_"):`
readme check 5 years ago			`stack.pop()`
			`# elif piece.startswith('<'):`
			`# print "ERROR:", piece`
			`# raise ValueError # Unexpected HTML`
			`else:`
			`stack[-1].append(piece)`
			`except (IndexError, ValueError):`
			`raise ValueError(`
add tag_comparison_v3.py 5 years ago			`"Bad IEER string (error at character {:d})".format(piece_m.start())`
readme check 5 years ago			`)`
			`if len(stack) != 1:`
add tag_comparison_v3.py 5 years ago			`raise ValueError("Bad IEER string")`
readme check 5 years ago			`return stack[0]`


			`def ieerstr2tree(`
			`s,`
			`chunk_types=[`
add tag_comparison_v3.py 5 years ago			`"LOCATION",`
			`"ORGANIZATION",`
			`"PERSON",`
			`"DURATION",`
			`"DATE",`
			`"CARDINAL",`
			`"PERCENT",`
			`"MONEY",`
			`"MEASURE",`
readme check 5 years ago			`],`
			`root_label="S",`
			`):`
			`"""`
			`Return a chunk structure containing the chunked tagged text that is`
			`encoded in the given IEER style string.`
			`Convert a string of chunked tagged text in the IEER named`
			`entity format into a chunk structure. Chunks are of several`
			`types, LOCATION, ORGANIZATION, PERSON, DURATION, DATE, CARDINAL,`
			`PERCENT, MONEY, and MEASURE.`

			`:rtype: Tree`
			`"""`

			`# Try looking for a single document. If that doesn't work, then just`
			`# treat everything as if it was within the <TEXT>...</TEXT>.`
			`m = _IEER_DOC_RE.match(s)`
			`if m:`
			`return {`
add tag_comparison_v3.py 5 years ago			`"text": _ieer_read_text(m.group("text"), root_label),`
			`"docno": m.group("docno"),`
			`"doctype": m.group("doctype"),`
			`"date_time": m.group("date_time"),`
readme check 5 years ago			`#'headline': m.group('headline')`
			`# we want to capture NEs in the headline too!`
add tag_comparison_v3.py 5 years ago			`"headline": _ieer_read_text(m.group("headline"), root_label),`
readme check 5 years ago			`}`
			`else:`
			`return _ieer_read_text(s, root_label)`


			`def demo():`

			`s = "[ Pierre/NNP Vinken/NNP ] ,/, [ 61/CD years/NNS ] old/JJ ,/, will/MD join/VB [ the/DT board/NN ] ./."`
			`import nltk`

add tag_comparison_v3.py 5 years ago			`t = nltk.chunk.tagstr2tree(s, chunk_label="NP")`
readme check 5 years ago			`t.pprint()`
			`print()`

			`s = """`
			`These DT B-NP`
			`research NN I-NP`
			`protocols NNS I-NP`
			`offer VBP B-VP`
			`to TO B-PP`
			`the DT B-NP`
			`patient NN I-NP`
			`not RB O`
			`only RB O`
			`the DT B-NP`
			`very RB I-NP`
			`best JJS I-NP`
			`therapy NN I-NP`
			`which WDT B-NP`
			`we PRP B-NP`
			`have VBP B-VP`
			`established VBN I-VP`
			`today NN B-NP`
			`but CC B-NP`
			`also RB I-NP`
			`the DT B-NP`
			`hope NN I-NP`
			`of IN B-PP`
			`something NN B-NP`
			`still RB B-ADJP`
			`better JJR I-ADJP`
			`. . O`
			`"""`

add tag_comparison_v3.py 5 years ago			`conll_tree = conllstr2tree(s, chunk_types=("NP", "PP"))`
readme check 5 years ago			`conll_tree.pprint()`

			`# Demonstrate CoNLL output`
			`print("CoNLL output:")`
			`print(nltk.chunk.tree2conllstr(conll_tree))`
			`print()`


# Run the demonstration when this module is executed as a script.
if __name__ == "__main__":
    demo()