bo-graduation/venv/lib/python3.7/site-packages/nltk/corpus/reader/propbank.py

# Natural Language Toolkit: PropBank Corpus Reader
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

import re
from functools import total_ordering
from xml.etree import ElementTree

from nltk.tree import Tree
from nltk.internals import raise_unorderable_types

from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *


class PropbankCorpusReader(CorpusReader):
    """
    Corpus reader for the propbank corpus, which augments the Penn
    Treebank with information about the predicate argument structure
    of every verb instance.  The corpus consists of two parts: the
    predicate-argument annotations themselves, and a set of "frameset
    files" which define the argument labels used by the annotations,
    on a per-verb basis.  Each "frameset file" contains one or more
    predicates, such as ``'turn'`` or ``'turn_on'``, each of which is
    divided into coarse-grained word senses called "rolesets".  For
    each "roleset", the frameset file provides descriptions of the
    argument roles, along with examples.
    """

    def __init__(
        self,
        root,
        propfile,
        framefiles="",
        verbsfile=None,
        parse_fileid_xform=None,
        parse_corpus=None,
        encoding="utf8",
    ):
        """
        :param root: The root directory for this corpus.
        :param propfile: The name of the file containing the predicate-
            argument annotations (relative to ``root``).
        :param framefiles: A list or regexp specifying the frameset
            fileids for this corpus.
        :param verbsfile: The name of the file read by ``verbs()``
            to list the verb lemmas of this corpus.
        :param parse_fileid_xform: A transform that should be applied
            to the fileids in this corpus.  This should be a function
            of one argument (a fileid) that returns a string (the new
            fileid).
        :param parse_corpus: The corpus containing the parse trees
            corresponding to this corpus.  These parse trees are
            necessary to resolve the tree pointers used by propbank.
        :param encoding: The encoding handed to ``CorpusReader`` for
            the files of this corpus.
        """
        # If framefiles is specified as a regexp, expand it.
        if isinstance(framefiles, str):
            framefiles = find_corpus_fileids(root, framefiles)
        framefiles = list(framefiles)
        # Initialize the corpus reader.
        CorpusReader.__init__(self, root, [propfile, verbsfile] + framefiles, encoding)

        # Record our frame fileids & prop file.
        self._propfile = propfile
        self._framefiles = framefiles
        self._verbsfile = verbsfile
        self._parse_fileid_xform = parse_fileid_xform
        self._parse_corpus = parse_corpus

    def raw(self, fileids=None):
        """
        :param fileids: A single fileid, a list of fileids, or None to
            read every fileid of this corpus.
        :return: the text contents of the given fileids, as a single string.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            # A lone fileid is wrapped so that it is not iterated
            # character by character below.
            fileids = [fileids]
        return concat([self.open(f).read() for f in fileids])

    def instances(self, baseform=None):
        """
        :param baseform: If given, only instances whose ``baseform``
            equals this value are included.
        :return: a corpus view that acts as a list of
        ``PropbankInstance`` objects, one for each verb in the corpus.
        """
        kwargs = {}
        if baseform is not None:
            kwargs["instance_filter"] = lambda inst: inst.baseform == baseform
        return StreamBackedCorpusView(
            self.abspath(self._propfile),
            lambda stream: self._read_instance_block(stream, **kwargs),
            encoding=self.encoding(self._propfile),
        )

    def lines(self):
        """
        :return: a corpus view that acts as a list of strings, one for
        each line in the predicate-argument annotation file.
        """
        return StreamBackedCorpusView(
            self.abspath(self._propfile),
            read_line_block,
            encoding=self.encoding(self._propfile),
        )

    def _load_frameset(self, framefile):
        """
        Parse one frameset file and return the root element of its XML
        tree.  The file is closed as soon as parsing is done.
        """
        # n.b.: The encoding for XML fileids is specified by the file
        # itself; so we ignore self._encoding here.
        with self.abspath(framefile).open() as fp:
            return ElementTree.parse(fp).getroot()

    def roleset(self, roleset_id):
        """
        :param roleset_id: A roleset identifier; the part before the
            first ``'.'`` names the frameset file to look in.
        :return: the xml description for the given roleset.
        :raise ValueError: if the frameset file is not part of this
            corpus, or if it does not contain the roleset.
        """
        baseform = roleset_id.split(".")[0]
        framefile = "frames/%s.xml" % baseform
        if framefile not in self._framefiles:
            raise ValueError("Frameset file for %s not found" % roleset_id)

        etree = self._load_frameset(framefile)
        for roleset in etree.findall("predicate/roleset"):
            if roleset.attrib["id"] == roleset_id:
                return roleset
        raise ValueError("Roleset %s not found in %s" % (roleset_id, framefile))

    def rolesets(self, baseform=None):
        """
        :param baseform: If given, only the frameset file of this
            baseform is read; otherwise every frameset file is read.
        :return: list of xml descriptions for rolesets.
        :raise ValueError: if ``baseform`` is given but its frameset
            file is not part of this corpus.
        """
        if baseform is not None:
            framefile = "frames/%s.xml" % baseform
            if framefile not in self._framefiles:
                raise ValueError("Frameset file for %s not found" % baseform)
            framefiles = [framefile]
        else:
            framefiles = self._framefiles

        rsets = [
            self._load_frameset(framefile).findall("predicate/roleset")
            for framefile in framefiles
        ]
        return LazyConcatenation(rsets)

    def verbs(self):
        """
        :return: a corpus view that acts as a list of all verb lemmas
        in this corpus (from the verbs.txt file).
        """
        return StreamBackedCorpusView(
            self.abspath(self._verbsfile),
            read_line_block,
            encoding=self.encoding(self._verbsfile),
        )

    def _read_instance_block(self, stream, instance_filter=lambda inst: True):
        """
        Read up to 100 lines from ``stream`` and return the list of
        ``PropbankInstance`` objects parsed from the non-blank ones
        that are accepted by ``instance_filter``.
        """
        block = []

        # Read 100 at a time.
        for _ in range(100):
            line = stream.readline().strip()
            if line:
                inst = PropbankInstance.parse(
                    line, self._parse_fileid_xform, self._parse_corpus
                )
                if instance_filter(inst):
                    block.append(inst)

        return block


######################################################################
# { Propbank Instance & related datatypes
######################################################################


class PropbankInstance(object):
    """
    A single line of the propbank annotation file: one predicate
    together with the locations and labels of its arguments.
    """

    def __init__(
        self,
        fileid,
        sentnum,
        wordnum,
        tagger,
        roleset,
        inflection,
        predicate,
        arguments,
        parse_corpus=None,
    ):

        self.fileid = fileid
        """The name of the file containing the parse tree for this
        instance's sentence."""

        self.sentnum = sentnum
        """The sentence number of this sentence within ``fileid``.
        Indexing starts from zero."""

        self.wordnum = wordnum
        """The word number of this instance's predicate within its
        containing sentence.  Word numbers are indexed starting from
        zero, and include traces and other empty parse elements."""

        self.tagger = tagger
        """An identifier for the tagger who tagged this instance; or
        ``'gold'`` if this is an adjudicated instance."""

        self.roleset = roleset
        """The name of the roleset used by this instance's predicate.
        Use ``propbank.roleset() <PropbankCorpusReader.roleset>`` to
        look up information about the roleset."""

        self.inflection = inflection
        """A ``PropbankInflection`` object describing the inflection of
        this instance's predicate."""

        self.predicate = predicate
        """A ``PropbankTreePointer`` indicating the position of this
        instance's predicate within its containing sentence."""

        self.arguments = tuple(arguments)
        """A tuple of pairs (argloc, argid), specifying the location
        and identifier for each of the predicate's argument in the
        containing sentence.  Argument identifiers are strings such as
        ``'ARG0'`` or ``'ARGM-TMP'``.  This tuple does *not* contain
        the predicate."""

        self.parse_corpus = parse_corpus
        """A corpus reader for the parse trees corresponding to the
        instances in this propbank corpus."""

    @property
    def baseform(self):
        """The part of the roleset name before its first ``'.'``."""
        name_parts = self.roleset.split(".")
        return name_parts[0]

    @property
    def sensenumber(self):
        """The part of the roleset name after its first ``'.'``."""
        name_parts = self.roleset.split(".")
        return name_parts[1]

    @property
    def predid(self):
        """The label used for the predicate itself (always ``'rel'``)."""
        return "rel"

    def __repr__(self):
        location = (self.fileid, self.sentnum, self.wordnum)
        return "<PropbankInstance: %s, sent %s, word %s>" % location

    def __str__(self):
        # The six fixed columns come first ...
        columns = [
            "%s" % (field,)
            for field in (
                self.fileid,
                self.sentnum,
                self.wordnum,
                self.tagger,
                self.roleset,
                self.inflection,
            )
        ]
        # ... followed by every argument plus the predicate, in sorted
        # order, each written as ``location-label``.
        labelled = sorted(self.arguments + ((self.predicate, "rel"),))
        columns.extend("%s-%s" % (argloc, argid) for (argloc, argid) in labelled)
        return " ".join(columns)

    def _get_tree(self):
        corpus = self.parse_corpus
        if corpus is None or self.fileid not in corpus.fileids():
            return None
        sentences = corpus.parsed_sents(self.fileid)
        return sentences[self.sentnum]

    tree = property(
        _get_tree,
        doc="""
        The parse tree corresponding to this instance, or None if
        the corresponding tree is not available.""",
    )

    @staticmethod
    def parse(s, parse_fileid_xform=None, parse_corpus=None):
        """
        Build a ``PropbankInstance`` from one whitespace-separated line
        of the annotation file.

        :param s: The line to parse: six fixed fields followed by the
            pointer-label items, exactly one of which ends in ``-rel``.
        :param parse_fileid_xform: Optional function applied to the
            fileid field before it is stored.
        :param parse_corpus: Stored on the new instance unchanged.
        :raise ValueError: if the line has fewer than seven fields or
            does not contain exactly one ``-rel`` item.
        """
        fields = s.split()
        if len(fields) < 7:
            raise ValueError("Badly formatted propbank line: %r" % s)

        # The first six fields are fixed; the rest are pointer items.
        fileid, sentnum, wordnum, tagger, roleset, inflection = fields[:6]
        rel_items = []
        arg_items = []
        for item in fields[6:]:
            if item.endswith("-rel"):
                rel_items.append(item)
            else:
                arg_items.append(item)
        if len(rel_items) != 1:
            raise ValueError("Badly formatted propbank line: %r" % s)

        # Map the fileid if a transform was supplied.
        if parse_fileid_xform is not None:
            fileid = parse_fileid_xform(fileid)

        # Sentence and word numbers are stored as ints.
        sentnum = int(sentnum)
        wordnum = int(wordnum)

        inflection = PropbankInflection.parse(inflection)

        # Strip the trailing "-rel" to get the predicate's pointer.
        predicate = PropbankTreePointer.parse(rel_items[0][:-4])

        # Each argument is "<pointer>-<label>", split at the first dash.
        arguments = [
            (PropbankTreePointer.parse(argloc), argid)
            for argloc, argid in (item.split("-", 1) for item in arg_items)
        ]

        return PropbankInstance(
            fileid,
            sentnum,
            wordnum,
            tagger,
            roleset,
            inflection,
            predicate,
            arguments,
            parse_corpus,
        )


class PropbankPointer(object):
    """
    Abstract base class for the pointers propbank uses to pick out one
    or more constituents of a parse tree.  It has three concrete
    subclasses:

      - ``PropbankTreePointer`` points to a single constituent.
      - ``PropbankSplitTreePointer`` points to a 'split' constituent,
        made up of a sequence of two or more ``PropbankTreePointer``
        pointers.
      - ``PropbankChainTreePointer`` points to an entire trace chain
        in a tree.  It is made up of a sequence of pieces, each a
        ``PropbankTreePointer`` or ``PropbankSplitTreePointer`` pointer.
    """

    def __init__(self):
        # Instantiating the base class itself is refused; subclasses
        # that call this constructor pass through untouched.
        instantiating_base = self.__class__ == PropbankPointer
        if instantiating_base:
            raise NotImplementedError()


class PropbankChainTreePointer(PropbankPointer):
    """
    A pointer to a trace chain, written as its pieces joined by ``*``.
    """

    def __init__(self, pieces):
        self.pieces = pieces
        """A list of the pieces that make up this chain.  Elements may
           be either ``PropbankSplitTreePointer`` or
           ``PropbankTreePointer`` pointers."""

    def __str__(self):
        rendered = [str(piece) for piece in self.pieces]
        return "*".join(rendered)

    def __repr__(self):
        return "<PropbankChainTreePointer: %s>" % str(self)

    def select(self, tree):
        """
        Return a ``*CHAIN*`` tree whose children are the constituents
        selected from ``tree`` by each piece, in order.

        :raise ValueError: if ``tree`` is None.
        """
        if tree is None:
            raise ValueError("Parse tree not avaialable")
        selected = []
        for piece in self.pieces:
            selected.append(piece.select(tree))
        return Tree("*CHAIN*", selected)


class PropbankSplitTreePointer(PropbankPointer):
    """
    A pointer to a split constituent, written as its pieces joined by
    ``,``.
    """

    def __init__(self, pieces):
        self.pieces = pieces
        """A list of the pieces that make up this split constituent.
           Elements are all ``PropbankTreePointer`` pointers."""

    def __str__(self):
        rendered = [str(piece) for piece in self.pieces]
        return ",".join(rendered)

    def __repr__(self):
        return "<PropbankSplitTreePointer: %s>" % str(self)

    def select(self, tree):
        """
        Return a ``*SPLIT*`` tree whose children are the constituents
        selected from ``tree`` by each piece, in order.

        :raise ValueError: if ``tree`` is None.
        """
        if tree is None:
            raise ValueError("Parse tree not avaialable")
        selected = []
        for piece in self.pieces:
            selected.append(piece.select(tree))
        return Tree("*SPLIT*", selected)


@total_ordering
class PropbankTreePointer(PropbankPointer):
    """
    A pointer to a single constituent, written ``wordnum:height``.

    ``wordnum`` is the zero-based index of a leaf of the parse tree and
    ``height`` is the number of levels to climb above that leaf's
    parent.  ``parse()`` also understands the composite notations:

      - ``wordnum:height*wordnum:height*...`` (a chain)
      - ``wordnum:height,wordnum:height,...`` (a split constituent)
    """

    def __init__(self, wordnum, height):
        self.wordnum = wordnum
        self.height = height

    @staticmethod
    def parse(s):
        """
        Convert the string form of a pointer into a pointer object.

        :return: a ``PropbankChainTreePointer`` if ``s`` contains
            ``*``, otherwise a ``PropbankSplitTreePointer`` if it
            contains ``,``, otherwise a ``PropbankTreePointer``.
        :raise ValueError: if a simple pointer is not of the form
            ``int:int``.
        """
        # Chains (xx*yy*zz) bind loosest, so they are split off first.
        chain_parts = s.split("*")
        if len(chain_parts) > 1:
            return PropbankChainTreePointer(
                [PropbankTreePointer.parse(part) for part in chain_parts]
            )

        # Then split arguments (xx,yy,zz).
        split_parts = s.split(",")
        if len(split_parts) > 1:
            return PropbankSplitTreePointer(
                [PropbankTreePointer.parse(part) for part in split_parts]
            )

        # What is left must be a plain "wordnum:height" pointer.
        numbers = s.split(":")
        if len(numbers) == 2:
            return PropbankTreePointer(int(numbers[0]), int(numbers[1]))
        raise ValueError("bad propbank pointer %r" % s)

    def __str__(self):
        return "%s:%s" % (self.wordnum, self.height)

    def __repr__(self):
        return "PropbankTreePointer(%d, %d)" % (self.wordnum, self.height)

    def __eq__(self, other):
        # A chain or split pointer is compared through its first piece.
        target = other
        while isinstance(target, (PropbankChainTreePointer, PropbankSplitTreePointer)):
            target = target.pieces[0]

        if isinstance(target, PropbankTreePointer):
            return self.wordnum == target.wordnum and self.height == target.height
        return self is target

    def __ne__(self, other):
        return not (self == other)

    def __lt__(self, other):
        # A chain or split pointer is compared through its first piece.
        target = other
        while isinstance(target, (PropbankChainTreePointer, PropbankSplitTreePointer)):
            target = target.pieces[0]

        if isinstance(target, PropbankTreePointer):
            # Earlier words first; at the same word, greater heights first.
            return (self.wordnum, -self.height) < (target.wordnum, -target.height)
        # Anything else is ordered by object identity.
        return id(self) < id(target)

    def select(self, tree):
        """
        Return the constituent of ``tree`` that this pointer refers to.

        :raise ValueError: if ``tree`` is None.
        """
        if tree is None:
            raise ValueError("Parse tree not avaialable")
        position = self.treepos(tree)
        return tree[position]

    def treepos(self, tree):
        """
        Translate this pointer into a standard 'tree position' (a tuple
        of child indices) relative to the given tree.

        :raise ValueError: if ``tree`` is None.
        """
        if tree is None:
            raise ValueError("Parse tree not avaialable")

        # ``ancestors`` holds the nodes from the root down to the node
        # being visited; ``path`` holds the child index chosen at each
        # of those levels.
        ancestors = [tree]
        path = []
        leaves_seen = 0

        while True:
            node = ancestors[-1]

            if not isinstance(node, Tree):
                # A leaf (word): either it is the one we want ...
                if leaves_seen == self.wordnum:
                    # Drop the leaf's own index plus ``height`` more
                    # levels from the end of the path.
                    return tuple(path[: len(path) - self.height - 1])
                # ... or we count it and go back up to its parent.
                leaves_seen += 1
                ancestors.pop()
                continue

            # A tree node: move on to its first or its next child.
            if len(path) < len(ancestors):
                path.append(0)
            else:
                path[-1] += 1

            child_index = path[-1]
            if child_index < len(node):
                ancestors.append(node[child_index])
            else:
                # All children visited: go back up one level.
                ancestors.pop()
                path.pop()


class PropbankInflection(object):
    """
    The inflection of a predicate, stored as five one-character codes
    (form, tense, aspect, person, voice).  ``'-'`` stands for "none"
    in any position.
    """

    # { Inflection Form
    INFINITIVE = "i"
    GERUND = "g"
    PARTICIPLE = "p"
    FINITE = "v"
    # { Inflection Tense
    FUTURE = "f"
    PAST = "p"
    PRESENT = "n"
    # { Inflection Aspect
    PERFECT = "p"
    PROGRESSIVE = "o"
    PERFECT_AND_PROGRESSIVE = "b"
    # { Inflection Person
    THIRD_PERSON = "3"
    # { Inflection Voice
    ACTIVE = "a"
    PASSIVE = "p"
    # { Inflection
    NONE = "-"
    # }

    def __init__(self, form="-", tense="-", aspect="-", person="-", voice="-"):
        self.form, self.tense, self.aspect = form, tense, aspect
        self.person, self.voice = person, voice

    def __str__(self):
        codes = (self.form, self.tense, self.aspect, self.person, self.voice)
        return "".join(codes)

    def __repr__(self):
        return "<PropbankInflection: %s>" % str(self)

    # One character class per position, in the order
    # form, tense, aspect, person, voice.
    _VALIDATE = re.compile(r"[igpv\-][fpn\-][pob\-][3\-][ap\-]$")

    @staticmethod
    def parse(s):
        """
        Build a ``PropbankInflection`` from its five-character string
        form.

        :raise TypeError: if ``s`` is not a string.
        :raise ValueError: if ``s`` is not five characters long or uses
            a code that is not allowed in its position.
        """
        if not isinstance(s, str):
            raise TypeError("expected a string")
        if len(s) == 5 and PropbankInflection._VALIDATE.match(s):
            form, tense, aspect, person, voice = s
            return PropbankInflection(form, tense, aspect, person, voice)
        raise ValueError("Bad propbank inflection string %r" % s)
readme check 4 years ago			`# Natural Language Toolkit: PropBank Corpus Reader`
			`#`
add tag_comparison_v3.py 4 years ago			`# Copyright (C) 2001-2020 NLTK Project`
readme check 4 years ago			`# Author: Edward Loper <edloper@gmail.com>`
			`# URL: <http://nltk.org/>`
			`# For license information, see LICENSE.TXT`

			`import re`
			`from functools import total_ordering`
			`from xml.etree import ElementTree`

			`from nltk.tree import Tree`
			`from nltk.internals import raise_unorderable_types`

			`from nltk.corpus.reader.util import *`
			`from nltk.corpus.reader.api import *`


			`class PropbankCorpusReader(CorpusReader):`
			`"""`
			`Corpus reader for the propbank corpus, which augments the Penn`
			`Treebank with information about the predicate argument structure`
			`of every verb instance. The corpus consists of two parts: the`
			`predicate-argument annotations themselves, and a set of "frameset`
			`files" which define the argument labels used by the annotations,`
			`on a per-verb basis. Each "frameset file" contains one or more`
			predicates, such as ``'turn'`` or ``'turn_on'``, each of which is
			`divided into coarse-grained word senses called "rolesets". For`
			`each "roleset", the frameset file provides descriptions of the`
			`argument roles, along with examples.`
			`"""`

			`def __init__(`
			`self,`
			`root,`
			`propfile,`
add tag_comparison_v3.py 4 years ago			`framefiles="",`
readme check 4 years ago			`verbsfile=None,`
			`parse_fileid_xform=None,`
			`parse_corpus=None,`
add tag_comparison_v3.py 4 years ago			`encoding="utf8",`
readme check 4 years ago			`):`
			`"""`
			`:param root: The root directory for this corpus.`
			`:param propfile: The name of the file containing the predicate-`
			argument annotations (relative to ``root``).
			`:param framefiles: A list or regexp specifying the frameset`
			`fileids for this corpus.`
			`:param parse_fileid_xform: A transform that should be applied`
			`to the fileids in this corpus. This should be a function`
			`of one argument (a fileid) that returns a string (the new`
			`fileid).`
			`:param parse_corpus: The corpus containing the parse trees`
			`corresponding to this corpus. These parse trees are`
			`necessary to resolve the tree pointers used by propbank.`
			`"""`
			`# If framefiles is specified as a regexp, expand it.`
add tag_comparison_v3.py 4 years ago			`if isinstance(framefiles, str):`
readme check 4 years ago			`framefiles = find_corpus_fileids(root, framefiles)`
			`framefiles = list(framefiles)`
			`# Initialze the corpus reader.`
			`CorpusReader.__init__(self, root, [propfile, verbsfile] + framefiles, encoding)`

			`# Record our frame fileids & prop file.`
			`self._propfile = propfile`
			`self._framefiles = framefiles`
			`self._verbsfile = verbsfile`
			`self._parse_fileid_xform = parse_fileid_xform`
			`self._parse_corpus = parse_corpus`

			`def raw(self, fileids=None):`
			`"""`
			`:return: the text contents of the given fileids, as a single string.`
			`"""`
			`if fileids is None:`
			`fileids = self._fileids`
			`elif isinstance(fileids):`
			`fileids = [fileids]`
			`return concat([self.open(f).read() for f in fileids])`

			`def instances(self, baseform=None):`
			`"""`
			`:return: a corpus view that acts as a list of`
			``PropBankInstance`` objects, one for each noun in the corpus.
			`"""`
			`kwargs = {}`
			`if baseform is not None:`
add tag_comparison_v3.py 4 years ago			`kwargs["instance_filter"] = lambda inst: inst.baseform == baseform`
readme check 4 years ago			`return StreamBackedCorpusView(`
			`self.abspath(self._propfile),`
			`lambda stream: self._read_instance_block(stream, **kwargs),`
			`encoding=self.encoding(self._propfile),`
			`)`

			`def lines(self):`
			`"""`
			`:return: a corpus view that acts as a list of strings, one for`
			`each line in the predicate-argument annotation file.`
			`"""`
			`return StreamBackedCorpusView(`
			`self.abspath(self._propfile),`
			`read_line_block,`
			`encoding=self.encoding(self._propfile),`
			`)`

			`def roleset(self, roleset_id):`
			`"""`
			`:return: the xml description for the given roleset.`
			`"""`
add tag_comparison_v3.py 4 years ago			`baseform = roleset_id.split(".")[0]`
			`framefile = "frames/%s.xml" % baseform`
readme check 4 years ago			`if framefile not in self._framefiles:`
add tag_comparison_v3.py 4 years ago			`raise ValueError("Frameset file for %s not found" % roleset_id)`
readme check 4 years ago
			`# n.b.: The encoding for XML fileids is specified by the file`
			`# itself; so we ignore self._encoding here.`
			`etree = ElementTree.parse(self.abspath(framefile).open()).getroot()`
add tag_comparison_v3.py 4 years ago			`for roleset in etree.findall("predicate/roleset"):`
			`if roleset.attrib["id"] == roleset_id:`
readme check 4 years ago			`return roleset`
add tag_comparison_v3.py 4 years ago			`raise ValueError("Roleset %s not found in %s" % (roleset_id, framefile))`
readme check 4 years ago
			`def rolesets(self, baseform=None):`
			`"""`
			`:return: list of xml descriptions for rolesets.`
			`"""`
			`if baseform is not None:`
add tag_comparison_v3.py 4 years ago			`framefile = "frames/%s.xml" % baseform`
readme check 4 years ago			`if framefile not in self._framefiles:`
add tag_comparison_v3.py 4 years ago			`raise ValueError("Frameset file for %s not found" % baseform)`
readme check 4 years ago			`framefiles = [framefile]`
			`else:`
			`framefiles = self._framefiles`

			`rsets = []`
			`for framefile in framefiles:`
			`# n.b.: The encoding for XML fileids is specified by the file`
			`# itself; so we ignore self._encoding here.`
			`etree = ElementTree.parse(self.abspath(framefile).open()).getroot()`
add tag_comparison_v3.py 4 years ago			`rsets.append(etree.findall("predicate/roleset"))`
readme check 4 years ago			`return LazyConcatenation(rsets)`

			`def verbs(self):`
			`"""`
			`:return: a corpus view that acts as a list of all verb lemmas`
			`in this corpus (from the verbs.txt file).`
			`"""`
			`return StreamBackedCorpusView(`
			`self.abspath(self._verbsfile),`
			`read_line_block,`
			`encoding=self.encoding(self._verbsfile),`
			`)`

			`def _read_instance_block(self, stream, instance_filter=lambda inst: True):`
			`block = []`

			`# Read 100 at a time.`
			`for i in range(100):`
			`line = stream.readline().strip()`
			`if line:`
			`inst = PropbankInstance.parse(`
			`line, self._parse_fileid_xform, self._parse_corpus`
			`)`
			`if instance_filter(inst):`
			`block.append(inst)`

			`return block`


			`######################################################################`
			`# { Propbank Instance & related datatypes`
			`######################################################################`


add tag_comparison_v3.py 4 years ago
readme check 4 years ago			`class PropbankInstance(object):`
			`def __init__(`
			`self,`
			`fileid,`
			`sentnum,`
			`wordnum,`
			`tagger,`
			`roleset,`
			`inflection,`
			`predicate,`
			`arguments,`
			`parse_corpus=None,`
			`):`

			`self.fileid = fileid`
			`"""The name of the file containing the parse tree for this`
			`instance's sentence."""`

			`self.sentnum = sentnum`
			"""The sentence number of this sentence within ``fileid``.
			`Indexing starts from zero."""`

			`self.wordnum = wordnum`
			`"""The word number of this instance's predicate within its`
			`containing sentence. Word numbers are indexed starting from`
			`zero, and include traces and other empty parse elements."""`

			`self.tagger = tagger`
			`"""An identifier for the tagger who tagged this instance; or`
			``'gold'`` if this is an adjuticated instance."""

			`self.roleset = roleset`
			`"""The name of the roleset used by this instance's predicate.`
			Use ``propbank.roleset() <PropbankCorpusReader.roleset>`` to
			`look up information about the roleset."""`

			`self.inflection = inflection`
			"""A ``PropbankInflection`` object describing the inflection of
			`this instance's predicate."""`

			`self.predicate = predicate`
			"""A ``PropbankTreePointer`` indicating the position of this
			`instance's predicate within its containing sentence."""`

			`self.arguments = tuple(arguments)`
			`"""A list of tuples (argloc, argid), specifying the location`
			`and identifier for each of the predicate's argument in the`
			`containing sentence. Argument identifiers are strings such as`
			``'ARG0'`` or ``'ARGM-TMP'``. This list does not contain
			`the predicate."""`

			`self.parse_corpus = parse_corpus`
			`"""A corpus reader for the parse trees corresponding to the`
			`instances in this propbank corpus."""`

			`@property`
			`def baseform(self):`
			`"""The baseform of the predicate."""`
add tag_comparison_v3.py 4 years ago			`return self.roleset.split(".")[0]`
readme check 4 years ago
			`@property`
			`def sensenumber(self):`
			`"""The sense number of the predicate."""`
add tag_comparison_v3.py 4 years ago			`return self.roleset.split(".")[1]`
readme check 4 years ago
			`@property`
			`def predid(self):`
			`"""Identifier of the predicate."""`
add tag_comparison_v3.py 4 years ago			`return "rel"`
readme check 4 years ago
			`def __repr__(self):`
add tag_comparison_v3.py 4 years ago			`return "<PropbankInstance: %s, sent %s, word %s>" % (`
readme check 4 years ago			`self.fileid,`
			`self.sentnum,`
			`self.wordnum,`
			`)`

			`def __str__(self):`
add tag_comparison_v3.py 4 years ago			`s = "%s %s %s %s %s %s" % (`
readme check 4 years ago			`self.fileid,`
			`self.sentnum,`
			`self.wordnum,`
			`self.tagger,`
			`self.roleset,`
			`self.inflection,`
			`)`
add tag_comparison_v3.py 4 years ago			`items = self.arguments + ((self.predicate, "rel"),)`
readme check 4 years ago			`for (argloc, argid) in sorted(items):`
add tag_comparison_v3.py 4 years ago			`s += " %s-%s" % (argloc, argid)`
readme check 4 years ago			`return s`

			`def _get_tree(self):`
			`if self.parse_corpus is None:`
			`return None`
			`if self.fileid not in self.parse_corpus.fileids():`
			`return None`
			`return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum]`

			`tree = property(`
			`_get_tree,`
			`doc="""`
			`The parse tree corresponding to this instance, or None if`
			`the corresponding tree is not available.""",`
			`)`

			`@staticmethod`
			`def parse(s, parse_fileid_xform=None, parse_corpus=None):`
			`pieces = s.split()`
			`if len(pieces) < 7:`
add tag_comparison_v3.py 4 years ago			`raise ValueError("Badly formatted propbank line: %r" % s)`
readme check 4 years ago
			`# Divide the line into its basic pieces.`
			`(fileid, sentnum, wordnum, tagger, roleset, inflection) = pieces[:6]`
add tag_comparison_v3.py 4 years ago			`rel = [p for p in pieces[6:] if p.endswith("-rel")]`
			`args = [p for p in pieces[6:] if not p.endswith("-rel")]`
readme check 4 years ago			`if len(rel) != 1:`
add tag_comparison_v3.py 4 years ago			`raise ValueError("Badly formatted propbank line: %r" % s)`
readme check 4 years ago
			`# Apply the fileid selector, if any.`
			`if parse_fileid_xform is not None:`
			`fileid = parse_fileid_xform(fileid)`

			`# Convert sentence & word numbers to ints.`
			`sentnum = int(sentnum)`
			`wordnum = int(wordnum)`

			`# Parse the inflection`
			`inflection = PropbankInflection.parse(inflection)`

			`# Parse the predicate location.`
			`predicate = PropbankTreePointer.parse(rel[0][:-4])`

			`# Parse the arguments.`
			`arguments = []`
			`for arg in args:`
add tag_comparison_v3.py 4 years ago			`argloc, argid = arg.split("-", 1)`
readme check 4 years ago			`arguments.append((PropbankTreePointer.parse(argloc), argid))`

			`# Put it all together.`
			`return PropbankInstance(`
			`fileid,`
			`sentnum,`
			`wordnum,`
			`tagger,`
			`roleset,`
			`inflection,`
			`predicate,`
			`arguments,`
			`parse_corpus,`
			`)`


			`class PropbankPointer(object):`
			`"""`
			`A pointer used by propbank to identify one or more constituents in`
			a parse tree. ``PropbankPointer`` is an abstract base class with
			`three concrete subclasses:`

			- ``PropbankTreePointer`` is used to point to single constituents.
			- ``PropbankSplitTreePointer`` is used to point to 'split'
			`constituents, which consist of a sequence of two or more`
			``PropbankTreePointer`` pointers.
			- ``PropbankChainTreePointer`` is used to point to entire trace
			`chains in a tree. It consists of a sequence of pieces, which`
			can be ``PropbankTreePointer`` or ``PropbankSplitTreePointer`` pointers.
			`"""`

			`def __init__(self):`
			`if self.__class__ == PropbankPointer:`
			`raise NotImplementedError()`


add tag_comparison_v3.py 4 years ago
readme check 4 years ago			`class PropbankChainTreePointer(PropbankPointer):`
			`def __init__(self, pieces):`
			`self.pieces = pieces`
			`"""A list of the pieces that make up this chain. Elements may`
			be either ``PropbankSplitTreePointer`` or
			``PropbankTreePointer`` pointers."""

			`def __str__(self):`
add tag_comparison_v3.py 4 years ago			`return "*".join("%s" % p for p in self.pieces)`
readme check 4 years ago
			`def __repr__(self):`
add tag_comparison_v3.py 4 years ago			`return "<PropbankChainTreePointer: %s>" % self`
readme check 4 years ago
			`def select(self, tree):`
			`if tree is None:`
add tag_comparison_v3.py 4 years ago			`raise ValueError("Parse tree not avaialable")`
			`return Tree("CHAIN", [p.select(tree) for p in self.pieces])`

readme check 4 years ago

			`class PropbankSplitTreePointer(PropbankPointer):`
			`def __init__(self, pieces):`
			`self.pieces = pieces`
			`"""A list of the pieces that make up this chain. Elements are`
			all ``PropbankTreePointer`` pointers."""

			`def __str__(self):`
add tag_comparison_v3.py 4 years ago			`return ",".join("%s" % p for p in self.pieces)`
readme check 4 years ago
			`def __repr__(self):`
add tag_comparison_v3.py 4 years ago			`return "<PropbankSplitTreePointer: %s>" % self`
readme check 4 years ago
			`def select(self, tree):`
			`if tree is None:`
add tag_comparison_v3.py 4 years ago			`raise ValueError("Parse tree not avaialable")`
			`return Tree("SPLIT", [p.select(tree) for p in self.pieces])`
@total_ordering
class PropbankTreePointer(PropbankPointer):
    """
    A pointer to a single node of a parse tree, written
    ``wordnum:height``.

    ``wordnum`` is the zero-based index of a leaf, counted in the order
    ``treepos`` walks the tree, and ``height`` is the number of levels
    to climb above that leaf's parent.

    The textual notation understood by ``parse`` also covers:

      - chains:     ``wordnum:height*wordnum:height*...``
      - split args: ``wordnum:height,wordnum:height,...``
    """

    def __init__(self, wordnum, height):
        """
        :param wordnum: Zero-based index of the leaf this pointer is
            anchored on.
        :param height: Number of levels above the leaf's parent.
        """
        self.wordnum = wordnum
        self.height = height

    @staticmethod
    def parse(s):
        """
        Build a pointer from its textual form.

        :param s: A string such as ``"3:0"``, ``"3:0,5:1"`` or
            ``"3:0*7:1"``.
        :return: A ``PropbankChainTreePointer`` if ``s`` contains ``*``,
            otherwise a ``PropbankSplitTreePointer`` if it contains
            ``,``, otherwise a ``PropbankTreePointer``.
        :raise ValueError: If a simple pointer does not consist of
            exactly two ``:``-separated fields, or if a field is not an
            integer (raised by ``int``).
        """
        # Deal with chains (xx*yy*zz)
        pieces = s.split("*")
        if len(pieces) > 1:
            return PropbankChainTreePointer(
                [PropbankTreePointer.parse(elt) for elt in pieces]
            )

        # Deal with split args (xx,yy,zz)
        pieces = s.split(",")
        if len(pieces) > 1:
            return PropbankSplitTreePointer(
                [PropbankTreePointer.parse(elt) for elt in pieces]
            )

        # Deal with normal pointers.
        pieces = s.split(":")
        if len(pieces) != 2:
            raise ValueError("bad propbank pointer %r" % s)
        return PropbankTreePointer(int(pieces[0]), int(pieces[1]))

    def __str__(self):
        return "%s:%s" % (self.wordnum, self.height)

    def __repr__(self):
        return "PropbankTreePointer(%d, %d)" % (self.wordnum, self.height)

    def __eq__(self, other):
        # A chain or split pointer is compared through its first piece,
        # unwrapping repeatedly until a non-compound value is reached.
        while isinstance(other, (PropbankChainTreePointer, PropbankSplitTreePointer)):
            other = other.pieces[0]

        # Anything that is not a tree pointer is only equal by identity.
        if not isinstance(other, PropbankTreePointer):
            return self is other

        return self.wordnum == other.wordnum and self.height == other.height

    def __ne__(self, other):
        return not self == other

    def __lt__(self, other):
        # Same unwrapping rule as __eq__.
        while isinstance(other, (PropbankChainTreePointer, PropbankSplitTreePointer)):
            other = other.pieces[0]

        # Fallback ordering against unrelated objects uses object ids.
        if not isinstance(other, PropbankTreePointer):
            return id(self) < id(other)

        # Order by word number first; for the same word, the pointer
        # with the greater height sorts first.
        return (self.wordnum, -self.height) < (other.wordnum, -other.height)

    def select(self, tree):
        """
        Return the node of ``tree`` that this pointer designates.

        :param tree: The parse tree that this pointer points into.
        :raise ValueError: If ``tree`` is None.
        :raise IndexError: If ``wordnum`` is past the last leaf of
            ``tree``.
        """
        if tree is None:
            raise ValueError("Parse tree not available")
        return tree[self.treepos(tree)]

    def treepos(self, tree):
        """
        Convert this pointer to a standard 'tree position' pointer,
        given that it points to the given tree.

        :param tree: The parse tree that this pointer points into.
        :return: A tuple of child indices leading from the root of
            ``tree`` to the designated node.
        :raise ValueError: If ``tree`` is None.
        :raise IndexError: If ``wordnum`` is past the last leaf of
            ``tree``.
        """
        if tree is None:
            raise ValueError("Parse tree not available")

        # ``stack`` holds the nodes on the path from the root to the
        # node being visited; ``treepos`` holds the child index taken at
        # each step of that path.
        stack = [tree]
        treepos = []

        wordnum = 0
        while stack:
            # tree node:
            if isinstance(stack[-1], Tree):
                # Select the next child.
                if len(treepos) < len(stack):
                    treepos.append(0)
                else:
                    treepos[-1] += 1
                # Update the stack.
                if treepos[-1] < len(stack[-1]):
                    stack.append(stack[-1][treepos[-1]])
                else:
                    # End of node's child list: pop up a level.
                    stack.pop()
                    treepos.pop()
            # word node:
            else:
                if wordnum == self.wordnum:
                    # Drop the leaf's own index plus ``height`` more.
                    # NOTE(review): if ``height`` exceeds the number of
                    # levels available, the slice bound goes negative
                    # and counts from the end instead -- confirm that
                    # callers never pass such a height.
                    return tuple(treepos[: len(treepos) - self.height - 1])
                else:
                    wordnum += 1
                    stack.pop()

        # Every leaf was visited without reaching ``self.wordnum``.
        raise IndexError(
            "word number %r is out of range for the given parse tree"
            % (self.wordnum,)
        )


add tag_comparison_v3.py 4 years ago
class PropbankInflection(object):
    """
    A five-field inflection code: form, tense, aspect, person and voice.

    In the textual form handled by ``parse`` and ``__str__`` each field
    is one character, in that order, with ``"-"`` standing for "none".
    The class attributes below name the characters accepted for each
    field.
    """

    # Form
    INFINITIVE = "i"
    GERUND = "g"
    PARTICIPLE = "p"
    FINITE = "v"
    # Tense
    FUTURE = "f"
    PAST = "p"
    PRESENT = "n"
    # Aspect
    PERFECT = "p"
    PROGRESSIVE = "o"
    PERFECT_AND_PROGRESSIVE = "b"
    # Person
    THIRD_PERSON = "3"
    # Voice
    ACTIVE = "a"
    PASSIVE = "p"
    # Any field left unspecified
    NONE = "-"

    # One character class per field, in form/tense/aspect/person/voice order.
    _VALIDATE = re.compile(r"[igpv\-][fpn\-][pob\-][3\-][ap\-]$")

    def __init__(self, form="-", tense="-", aspect="-", person="-", voice="-"):
        """
        Store the five fields as given; no validation is performed here.
        """
        self.form, self.tense, self.aspect = form, tense, aspect
        self.person, self.voice = person, voice

    def __str__(self):
        # Concatenate the fields in their canonical order.
        return (
            self.form
            + self.tense
            + self.aspect
            + self.person
            + self.voice
        )

    def __repr__(self):
        return "<PropbankInflection: %s>" % (self,)

    @staticmethod
    def parse(s):
        """
        Build a ``PropbankInflection`` from a five-character string.

        :param s: The inflection string, e.g. ``"vp--a"``.
        :raise TypeError: If ``s`` is not a string.
        :raise ValueError: If ``s`` is not exactly five characters long
            or contains a character not allowed in its position.
        """
        if not isinstance(s, str):
            raise TypeError("expected a string")

        # The explicit length test also rejects a five-character code
        # followed by a newline, which ``$`` alone would let through.
        if len(s) != 5 or PropbankInflection._VALIDATE.match(s) is None:
            raise ValueError("Bad propbank inflection string %r" % s)

        form, tense, aspect, person, voice = s
        return PropbankInflection(form, tense, aspect, person, voice)