You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

535 lines
17 KiB
Python

# Natural Language Toolkit: PropBank Corpus Reader
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
import re
from functools import total_ordering
from xml.etree import ElementTree
from nltk.tree import Tree
from nltk.internals import raise_unorderable_types
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
class PropbankCorpusReader(CorpusReader):
"""
Corpus reader for the propbank corpus, which augments the Penn
Treebank with information about the predicate argument structure
of every verb instance. The corpus consists of two parts: the
predicate-argument annotations themselves, and a set of "frameset
files" which define the argument labels used by the annotations,
on a per-verb basis. Each "frameset file" contains one or more
predicates, such as ``'turn'`` or ``'turn_on'``, each of which is
divided into coarse-grained word senses called "rolesets". For
each "roleset", the frameset file provides descriptions of the
argument roles, along with examples.
"""
def __init__(
self,
root,
propfile,
framefiles="",
verbsfile=None,
parse_fileid_xform=None,
parse_corpus=None,
encoding="utf8",
):
"""
:param root: The root directory for this corpus.
:param propfile: The name of the file containing the predicate-
argument annotations (relative to ``root``).
:param framefiles: A list or regexp specifying the frameset
fileids for this corpus.
:param parse_fileid_xform: A transform that should be applied
to the fileids in this corpus. This should be a function
of one argument (a fileid) that returns a string (the new
fileid).
:param parse_corpus: The corpus containing the parse trees
corresponding to this corpus. These parse trees are
necessary to resolve the tree pointers used by propbank.
"""
# If framefiles is specified as a regexp, expand it.
if isinstance(framefiles, str):
framefiles = find_corpus_fileids(root, framefiles)
framefiles = list(framefiles)
# Initialze the corpus reader.
CorpusReader.__init__(self, root, [propfile, verbsfile] + framefiles, encoding)
# Record our frame fileids & prop file.
self._propfile = propfile
self._framefiles = framefiles
self._verbsfile = verbsfile
self._parse_fileid_xform = parse_fileid_xform
self._parse_corpus = parse_corpus
def raw(self, fileids=None):
"""
:return: the text contents of the given fileids, as a single string.
"""
if fileids is None:
fileids = self._fileids
elif isinstance(fileids):
fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
def instances(self, baseform=None):
"""
:return: a corpus view that acts as a list of
``PropBankInstance`` objects, one for each noun in the corpus.
"""
kwargs = {}
if baseform is not None:
kwargs["instance_filter"] = lambda inst: inst.baseform == baseform
return StreamBackedCorpusView(
self.abspath(self._propfile),
lambda stream: self._read_instance_block(stream, **kwargs),
encoding=self.encoding(self._propfile),
)
def lines(self):
"""
:return: a corpus view that acts as a list of strings, one for
each line in the predicate-argument annotation file.
"""
return StreamBackedCorpusView(
self.abspath(self._propfile),
read_line_block,
encoding=self.encoding(self._propfile),
)
def roleset(self, roleset_id):
"""
:return: the xml description for the given roleset.
"""
baseform = roleset_id.split(".")[0]
framefile = "frames/%s.xml" % baseform
if framefile not in self._framefiles:
raise ValueError("Frameset file for %s not found" % roleset_id)
# n.b.: The encoding for XML fileids is specified by the file
# itself; so we ignore self._encoding here.
etree = ElementTree.parse(self.abspath(framefile).open()).getroot()
for roleset in etree.findall("predicate/roleset"):
if roleset.attrib["id"] == roleset_id:
return roleset
raise ValueError("Roleset %s not found in %s" % (roleset_id, framefile))
def rolesets(self, baseform=None):
"""
:return: list of xml descriptions for rolesets.
"""
if baseform is not None:
framefile = "frames/%s.xml" % baseform
if framefile not in self._framefiles:
raise ValueError("Frameset file for %s not found" % baseform)
framefiles = [framefile]
else:
framefiles = self._framefiles
rsets = []
for framefile in framefiles:
# n.b.: The encoding for XML fileids is specified by the file
# itself; so we ignore self._encoding here.
etree = ElementTree.parse(self.abspath(framefile).open()).getroot()
rsets.append(etree.findall("predicate/roleset"))
return LazyConcatenation(rsets)
def verbs(self):
"""
:return: a corpus view that acts as a list of all verb lemmas
in this corpus (from the verbs.txt file).
"""
return StreamBackedCorpusView(
self.abspath(self._verbsfile),
read_line_block,
encoding=self.encoding(self._verbsfile),
)
def _read_instance_block(self, stream, instance_filter=lambda inst: True):
block = []
# Read 100 at a time.
for i in range(100):
line = stream.readline().strip()
if line:
inst = PropbankInstance.parse(
line, self._parse_fileid_xform, self._parse_corpus
)
if instance_filter(inst):
block.append(inst)
return block
######################################################################
# { Propbank Instance & related datatypes
######################################################################
class PropbankInstance(object):
def __init__(
self,
fileid,
sentnum,
wordnum,
tagger,
roleset,
inflection,
predicate,
arguments,
parse_corpus=None,
):
self.fileid = fileid
"""The name of the file containing the parse tree for this
instance's sentence."""
self.sentnum = sentnum
"""The sentence number of this sentence within ``fileid``.
Indexing starts from zero."""
self.wordnum = wordnum
"""The word number of this instance's predicate within its
containing sentence. Word numbers are indexed starting from
zero, and include traces and other empty parse elements."""
self.tagger = tagger
"""An identifier for the tagger who tagged this instance; or
``'gold'`` if this is an adjuticated instance."""
self.roleset = roleset
"""The name of the roleset used by this instance's predicate.
Use ``propbank.roleset() <PropbankCorpusReader.roleset>`` to
look up information about the roleset."""
self.inflection = inflection
"""A ``PropbankInflection`` object describing the inflection of
this instance's predicate."""
self.predicate = predicate
"""A ``PropbankTreePointer`` indicating the position of this
instance's predicate within its containing sentence."""
self.arguments = tuple(arguments)
"""A list of tuples (argloc, argid), specifying the location
and identifier for each of the predicate's argument in the
containing sentence. Argument identifiers are strings such as
``'ARG0'`` or ``'ARGM-TMP'``. This list does *not* contain
the predicate."""
self.parse_corpus = parse_corpus
"""A corpus reader for the parse trees corresponding to the
instances in this propbank corpus."""
@property
def baseform(self):
"""The baseform of the predicate."""
return self.roleset.split(".")[0]
@property
def sensenumber(self):
"""The sense number of the predicate."""
return self.roleset.split(".")[1]
@property
def predid(self):
"""Identifier of the predicate."""
return "rel"
def __repr__(self):
return "<PropbankInstance: %s, sent %s, word %s>" % (
self.fileid,
self.sentnum,
self.wordnum,
)
def __str__(self):
s = "%s %s %s %s %s %s" % (
self.fileid,
self.sentnum,
self.wordnum,
self.tagger,
self.roleset,
self.inflection,
)
items = self.arguments + ((self.predicate, "rel"),)
for (argloc, argid) in sorted(items):
s += " %s-%s" % (argloc, argid)
return s
def _get_tree(self):
if self.parse_corpus is None:
return None
if self.fileid not in self.parse_corpus.fileids():
return None
return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum]
tree = property(
_get_tree,
doc="""
The parse tree corresponding to this instance, or None if
the corresponding tree is not available.""",
)
@staticmethod
def parse(s, parse_fileid_xform=None, parse_corpus=None):
pieces = s.split()
if len(pieces) < 7:
raise ValueError("Badly formatted propbank line: %r" % s)
# Divide the line into its basic pieces.
(fileid, sentnum, wordnum, tagger, roleset, inflection) = pieces[:6]
rel = [p for p in pieces[6:] if p.endswith("-rel")]
args = [p for p in pieces[6:] if not p.endswith("-rel")]
if len(rel) != 1:
raise ValueError("Badly formatted propbank line: %r" % s)
# Apply the fileid selector, if any.
if parse_fileid_xform is not None:
fileid = parse_fileid_xform(fileid)
# Convert sentence & word numbers to ints.
sentnum = int(sentnum)
wordnum = int(wordnum)
# Parse the inflection
inflection = PropbankInflection.parse(inflection)
# Parse the predicate location.
predicate = PropbankTreePointer.parse(rel[0][:-4])
# Parse the arguments.
arguments = []
for arg in args:
argloc, argid = arg.split("-", 1)
arguments.append((PropbankTreePointer.parse(argloc), argid))
# Put it all together.
return PropbankInstance(
fileid,
sentnum,
wordnum,
tagger,
roleset,
inflection,
predicate,
arguments,
parse_corpus,
)
class PropbankPointer(object):
"""
A pointer used by propbank to identify one or more constituents in
a parse tree. ``PropbankPointer`` is an abstract base class with
three concrete subclasses:
- ``PropbankTreePointer`` is used to point to single constituents.
- ``PropbankSplitTreePointer`` is used to point to 'split'
constituents, which consist of a sequence of two or more
``PropbankTreePointer`` pointers.
- ``PropbankChainTreePointer`` is used to point to entire trace
chains in a tree. It consists of a sequence of pieces, which
can be ``PropbankTreePointer`` or ``PropbankSplitTreePointer`` pointers.
"""
def __init__(self):
if self.__class__ == PropbankPointer:
raise NotImplementedError()
class PropbankChainTreePointer(PropbankPointer):
def __init__(self, pieces):
self.pieces = pieces
"""A list of the pieces that make up this chain. Elements may
be either ``PropbankSplitTreePointer`` or
``PropbankTreePointer`` pointers."""
def __str__(self):
return "*".join("%s" % p for p in self.pieces)
def __repr__(self):
return "<PropbankChainTreePointer: %s>" % self
def select(self, tree):
if tree is None:
raise ValueError("Parse tree not avaialable")
return Tree("*CHAIN*", [p.select(tree) for p in self.pieces])
class PropbankSplitTreePointer(PropbankPointer):
def __init__(self, pieces):
self.pieces = pieces
"""A list of the pieces that make up this chain. Elements are
all ``PropbankTreePointer`` pointers."""
def __str__(self):
return ",".join("%s" % p for p in self.pieces)
def __repr__(self):
return "<PropbankSplitTreePointer: %s>" % self
def select(self, tree):
if tree is None:
raise ValueError("Parse tree not avaialable")
return Tree("*SPLIT*", [p.select(tree) for p in self.pieces])
@total_ordering
class PropbankTreePointer(PropbankPointer):
"""
wordnum:height*wordnum:height*...
wordnum:height,
"""
def __init__(self, wordnum, height):
self.wordnum = wordnum
self.height = height
@staticmethod
def parse(s):
# Deal with chains (xx*yy*zz)
pieces = s.split("*")
if len(pieces) > 1:
return PropbankChainTreePointer(
[PropbankTreePointer.parse(elt) for elt in pieces]
)
# Deal with split args (xx,yy,zz)
pieces = s.split(",")
if len(pieces) > 1:
return PropbankSplitTreePointer(
[PropbankTreePointer.parse(elt) for elt in pieces]
)
# Deal with normal pointers.
pieces = s.split(":")
if len(pieces) != 2:
raise ValueError("bad propbank pointer %r" % s)
return PropbankTreePointer(int(pieces[0]), int(pieces[1]))
def __str__(self):
return "%s:%s" % (self.wordnum, self.height)
def __repr__(self):
return "PropbankTreePointer(%d, %d)" % (self.wordnum, self.height)
def __eq__(self, other):
while isinstance(other, (PropbankChainTreePointer, PropbankSplitTreePointer)):
other = other.pieces[0]
if not isinstance(other, PropbankTreePointer):
return self is other
return self.wordnum == other.wordnum and self.height == other.height
def __ne__(self, other):
return not self == other
def __lt__(self, other):
while isinstance(other, (PropbankChainTreePointer, PropbankSplitTreePointer)):
other = other.pieces[0]
if not isinstance(other, PropbankTreePointer):
return id(self) < id(other)
return (self.wordnum, -self.height) < (other.wordnum, -other.height)
def select(self, tree):
if tree is None:
raise ValueError("Parse tree not avaialable")
return tree[self.treepos(tree)]
def treepos(self, tree):
"""
Convert this pointer to a standard 'tree position' pointer,
given that it points to the given tree.
"""
if tree is None:
raise ValueError("Parse tree not avaialable")
stack = [tree]
treepos = []
wordnum = 0
while True:
# tree node:
if isinstance(stack[-1], Tree):
# Select the next child.
if len(treepos) < len(stack):
treepos.append(0)
else:
treepos[-1] += 1
# Update the stack.
if treepos[-1] < len(stack[-1]):
stack.append(stack[-1][treepos[-1]])
else:
# End of node's child list: pop up a level.
stack.pop()
treepos.pop()
# word node:
else:
if wordnum == self.wordnum:
return tuple(treepos[: len(treepos) - self.height - 1])
else:
wordnum += 1
stack.pop()
class PropbankInflection(object):
# { Inflection Form
INFINITIVE = "i"
GERUND = "g"
PARTICIPLE = "p"
FINITE = "v"
# { Inflection Tense
FUTURE = "f"
PAST = "p"
PRESENT = "n"
# { Inflection Aspect
PERFECT = "p"
PROGRESSIVE = "o"
PERFECT_AND_PROGRESSIVE = "b"
# { Inflection Person
THIRD_PERSON = "3"
# { Inflection Voice
ACTIVE = "a"
PASSIVE = "p"
# { Inflection
NONE = "-"
# }
def __init__(self, form="-", tense="-", aspect="-", person="-", voice="-"):
self.form = form
self.tense = tense
self.aspect = aspect
self.person = person
self.voice = voice
def __str__(self):
return self.form + self.tense + self.aspect + self.person + self.voice
def __repr__(self):
return "<PropbankInflection: %s>" % self
_VALIDATE = re.compile(r"[igpv\-][fpn\-][pob\-][3\-][ap\-]$")
@staticmethod
def parse(s):
if not isinstance(s, str):
raise TypeError("expected a string")
if len(s) != 5 or not PropbankInflection._VALIDATE.match(s):
raise ValueError("Bad propbank inflection string %r" % s)
return PropbankInflection(*s)