You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

298 lines
11 KiB
Python

# Natural Language Toolkit: SemCor Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Nathan Schneider <nschneid@cs.cmu.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Corpus reader for the SemCor Corpus.
"""
from __future__ import absolute_import, unicode_literals
__docformat__ = 'epytext en'
from nltk.corpus.reader.api import *
from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView
from nltk.tree import Tree
class SemcorCorpusReader(XMLCorpusReader):
    """
    Corpus reader for the SemCor Corpus.
    For access to the complete XML data structure, use the ``xml()``
    method.  For access to simple word lists and tagged word lists, use
    ``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.
    """

    def __init__(self, root, fileids, wordnet, lazy=True):
        """
        :param root: The root directory of the corpus (passed to
            ``XMLCorpusReader``).
        :param fileids: The file identifiers of the corpus (passed to
            ``XMLCorpusReader``).
        :param wordnet: The WordNet reader used to resolve sense keys; it
            must provide ``lemma_from_key()``.
        :param lazy: If true, return stream-backed ``SemcorWordView``
            objects; otherwise parse each file completely into a list.
        """
        XMLCorpusReader.__init__(self, root, fileids)
        self._lazy = lazy
        self._wordnet = wordnet

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return self._items(fileids, 'word', False, False, False)

    def chunks(self, fileids=None):
        """
        :return: the given file(s) as a list of chunks,
            each of which is a list of words and punctuation symbols
            that form a unit.
        :rtype: list(list(str))
        """
        return self._items(fileids, 'chunk', False, False, False)

    def tagged_chunks(self, fileids=None, tag='pos'):
        """
        :return: the given file(s) as a list of tagged chunks, represented
            in tree form.
        :rtype: list(Tree)
        :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
            to indicate the kind of tags to include.  Any value other than
            `'pos'` or `'sem'` is treated like `'both'`.  Semantic tags
            consist of WordNet lemma IDs, plus an `'NE'` node if the chunk
            is a named entity without a specific entry in WordNet.  (Named
            entities of type 'other' have no lemma.)  Other chunks not in
            WordNet have no semantic tag.  Punctuation tokens have `None`
            for their part of speech tag.
        """
        return self._items(fileids, 'chunk', False, tag != 'sem', tag != 'pos')

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of sentences, each encoded
            as a list of word strings.
        :rtype: list(list(str))
        """
        return self._items(fileids, 'word', True, False, False)

    def chunk_sents(self, fileids=None):
        """
        :return: the given file(s) as a list of sentences, each encoded
            as a list of chunks.
        :rtype: list(list(list(str)))
        """
        return self._items(fileids, 'chunk', True, False, False)

    def tagged_sents(self, fileids=None, tag='pos'):
        """
        :return: the given file(s) as a list of sentences.  Each sentence
            is represented as a list of tagged chunks (in tree form).
        :rtype: list(list(Tree))
        :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
            to indicate the kind of tags to include.  Any value other than
            `'pos'` or `'sem'` is treated like `'both'`.  Semantic tags
            consist of WordNet lemma IDs, plus an `'NE'` node if the chunk
            is a named entity without a specific entry in WordNet.  (Named
            entities of type 'other' have no lemma.)  Other chunks not in
            WordNet have no semantic tag.  Punctuation tokens have `None`
            for their part of speech tag.
        """
        return self._items(fileids, 'chunk', True, tag != 'sem', tag != 'pos')

    def _items(self, fileids, unit, bracket_sent, pos_tag, sem_tag):
        """
        Build the per-file item sequences and concatenate them.

        :param fileids: The file identifiers to read (passed to
            ``abspaths()``).
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags.
        :return: the result of ``concat()`` over one sequence per file.
        """
        flatten = unit == 'word' and not bracket_sent

        def load(fileid):
            if not self._lazy:
                # _words() takes no wordnet argument (it reads
                # self._wordnet) and has already flattened multiword units,
                # so its result must not be flattened a second time.
                return self._words(fileid, unit, bracket_sent, pos_tag, sem_tag)
            view = SemcorWordView(
                fileid, unit, bracket_sent, pos_tag, sem_tag, self._wordnet
            )
            # the result of the SemcorWordView may be a multiword unit, so
            # the LazyConcatenation will make sure the sentence is flattened
            return LazyConcatenation(view) if flatten else view

        return concat([load(fileid) for fileid in self.abspaths(fileids)])

    def _words(self, fileid, unit, bracket_sent, pos_tag, sem_tag):
        """
        Helper used to implement the view methods -- returns a list of
        tokens, (segmented) words, chunks, or sentences.  The tokens
        and chunks may optionally be tagged (with POS and sense
        information).

        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
            and OOV named entity status.
        """
        assert unit in ('token', 'word', 'chunk')
        result = []
        xmldoc = ElementTree.parse(fileid).getroot()
        for xmlsent in xmldoc.findall('.//s'):
            sent = []
            for xmlword in _all_xmlwords_in(xmlsent):
                itm = SemcorCorpusReader._word(
                    xmlword, unit, pos_tag, sem_tag, self._wordnet
                )
                if unit == 'word':
                    # _word() returns a list of words here; splice it in.
                    sent.extend(itm)
                else:
                    sent.append(itm)
            if bracket_sent:
                result.append(SemcorSentence(xmlsent.attrib['snum'], sent))
            else:
                result.extend(sent)
        assert None not in result
        return result

    @staticmethod
    def _resolve_sense(wordnet, sense_key, lemma, wnpos, sensenum):
        """
        Return the WordNet Lemma object for ``sense_key``, or a string of
        the form ``lemma.pos.NN`` when the Lemma cannot be retrieved.
        """
        try:
            return wordnet.lemma_from_key(sense_key)  # Lemma object
        except Exception:
            # Deliberately broad: cannot retrieve the wordnet.Lemma object.
            # Possible reasons:
            #  (a) the wordnet corpus is not downloaded;
            #  (b) a nonexistent sense is annotated: e.g., such.s.00 triggers
            #      nltk.corpus.reader.wordnet.WordNetError: No synset found
            #      for key u'such%5:00:01:specified:00'
            # Solution: just use the lemma name as a string.
            try:
                # e.g.: reach.v.02
                return '%s.%s.%02d' % (lemma, wnpos, int(sensenum))
            except ValueError:
                # e.g. the sense number may be "2;1"
                return '%s.%s.%s' % (lemma, wnpos, sensenum)

    @staticmethod
    def _word(xmlword, unit, pos_tag, sem_tag, wordnet):
        """
        Convert one ``wf``/``punc`` XML element into a token, a list of
        words, or a (possibly tagged) chunk.

        :param xmlword: The XML element to convert.
        :param unit: `'token'` returns the raw text (or a tuple starting
            with it when tags are requested); `'word'` returns the text
            split on underscores; any other value returns a chunk.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags.
        :param wordnet: The WordNet reader used to resolve sense keys;
            only consulted for sense-annotated chunks when ``sem_tag`` is
            true.
        """
        tkn = xmlword.text
        if not tkn:
            tkn = ""  # fixes issue 337?

        lemma = xmlword.get('lemma', tkn)  # lemma or NE class
        lexsn = xmlword.get('lexsn')  # lex_sense (locator for the lemma's sense)
        if lexsn is not None:
            sense_key = lemma + '%' + lexsn
            # see http://wordnet.princeton.edu/man/senseidx.5WN.html
            wnpos = ('n', 'v', 'a', 'r', 's')[int(lexsn.split(':')[0]) - 1]
        else:
            sense_key = wnpos = None
        # NOTE: the "rdf" (redefinition) attribute is not used.  It indicates
        # that the lookup string does not exactly match the enclosed string,
        # e.g. due to typographical adjustments or discontinuity of a
        # multiword expression.  If a redefinition has occurred, "rdf" holds
        # its inflected form and "lemma" holds its lemma.  For NEs, "rdf",
        # "lemma", and "pn" all hold the same value (the NE class).
        sensenum = xmlword.get('wnsn')  # WordNet sense number
        # a "personal name" (NE) not in WordNet
        is_oov_entity = 'pn' in xmlword.keys()
        # part of speech for the whole chunk (None for punctuation)
        pos = xmlword.get('pos')

        if unit == 'token':
            if not pos_tag and not sem_tag:
                return tkn
            itm = (tkn,)
            if pos_tag:
                itm += (pos,)
            if sem_tag:
                itm += (lemma, wnpos, sensenum, is_oov_entity)
            return itm

        ww = tkn.split('_')  # TODO: case where punctuation intervenes in MWE
        if unit == 'word':
            return ww

        # Chunk: the words, optionally under a POS node.
        bottom = [Tree(pos, ww)] if pos_tag else ww
        untagged = bottom[0] if pos_tag else bottom  # chunk without sem tags
        if not sem_tag:
            return untagged
        if sensenum is None:
            # 'other' NE, or a chunk with no WordNet sense at all
            return Tree('NE', bottom) if is_oov_entity else untagged

        # Only look the sense up when semantic tags are actually requested.
        sense = SemcorCorpusReader._resolve_sense(
            wordnet, sense_key, lemma, wnpos, sensenum
        )
        if is_oov_entity:
            return Tree(sense, [Tree('NE', bottom)])
        return Tree(sense, bottom)
def _all_xmlwords_in(elt, result=None):
    """
    Collect, in document order, every ``wf`` and ``punc`` element found
    beneath ``elt``.  Matching elements are not searched any further; all
    other elements are descended into.

    :param elt: The XML element whose descendants are searched.
    :param result: Optional list to append the matches to; a new list is
        created when it is None.
    :return: the list holding the matches (``result`` itself if given).
    """
    found = [] if result is None else result
    # Stack of child iterators: the top one is the element being scanned.
    pending = [iter(elt)]
    while pending:
        for node in pending[-1]:
            if node.tag in ('wf', 'punc'):
                found.append(node)
            else:
                # Descend now; the parent's iterator keeps its position
                # and is resumed once this subtree is exhausted.
                pending.append(iter(node))
                break
        else:
            # Top iterator is exhausted: go back up one level.
            pending.pop()
    return found
class SemcorSentence(list):
    """
    A list of words, augmented by an attribute ``num`` used to record
    the sentence identifier (callers in this module pass the ``snum``
    attribute of the XML sentence element).
    """

    def __init__(self, num, items):
        """
        :param num: The sentence identifier, stored as ``self.num``.
        :param items: The items used to populate the list.
        """
        super(SemcorSentence, self).__init__(items)
        self.num = num
class SemcorWordView(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with the SemCor corpus.
    """

    def __init__(self, fileid, unit, bracket_sent, pos_tag, sem_tag, wordnet):
        """
        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
            and OOV named entity status.
        :param wordnet: The WordNet reader handed on to
            ``SemcorCorpusReader._word()``.
        """
        self._unit = unit
        self._sent = bracket_sent
        self._pos_tag = pos_tag
        self._sem_tag = sem_tag
        self._wordnet = wordnet
        # Select whole sentence elements when bracketing, otherwise the
        # individual wf/punc elements directly under a sentence.
        tagspec = '.*/s' if bracket_sent else '.*/s/(punc|wf)'
        XMLCorpusView.__init__(self, fileid, tagspec)

    def handle_elt(self, elt, context):
        """
        Dispatch a selected element to the sentence or word handler,
        depending on whether sentence bracketing was requested.
        """
        handler = self.handle_sent if self._sent else self.handle_word
        return handler(elt)

    def handle_word(self, elt):
        """
        Convert a single ``wf``/``punc`` element using
        ``SemcorCorpusReader._word()`` and this view's settings.
        """
        return SemcorCorpusReader._word(
            elt, self._unit, self._pos_tag, self._sem_tag, self._wordnet
        )

    def handle_sent(self, elt):
        """
        Convert a sentence element into a ``SemcorSentence``.

        :raise ValueError: if a direct child is neither ``wf`` nor ``punc``.
        """
        splice = self._unit == 'word'
        items = []
        for child in elt:
            if child.tag not in ('wf', 'punc'):
                raise ValueError('Unexpected element %s' % child.tag)
            itm = self.handle_word(child)
            if splice:
                # word units come back as a list of words; flatten them
                items.extend(itm)
            else:
                items.append(itm)
        return SemcorSentence(elt.attrib['snum'], items)