You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

259 lines
9.2 KiB
Python

# Natural Language Toolkit: BNC Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""Corpus reader for the XML version of the British National Corpus."""
from nltk.corpus.reader.util import concat
from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView, ElementTree
class BNCCorpusReader(XMLCorpusReader):
    # Raw docstring: the example regular expression below contains
    # backslashes that would otherwise be invalid escape sequences.
    r"""Corpus reader for the XML version of the British National Corpus.

    For access to the complete XML data structure, use the ``xml()``
    method.  For access to simple word lists and tagged word lists, use
    ``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.

    You can obtain the full version of the BNC corpus at
    http://www.ota.ox.ac.uk/desc/2554

    If you extracted the archive to a directory called `BNC`, then you can
    instantiate the reader as::

        BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')

    """

    def __init__(self, root, fileids, lazy=True):
        """
        :param root: The root directory of the corpus, passed through to
            ``XMLCorpusReader``.
        :param fileids: The file identifiers (or a regexp matching them),
            passed through to ``XMLCorpusReader``.
        :param lazy: If true, the access methods build ``BNCWordView``
            objects; otherwise every file is parsed eagerly with
            ElementTree and plain lists are returned.
        """
        XMLCorpusReader.__init__(self, root, fileids)
        self._lazy = lazy

    def words(self, fileids=None, strip_space=True, stem=False):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)

        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        """
        return self._views(
            fileids, sent=False, tag=None, strip_space=strip_space, stem=stem
        )

    def tagged_words(self, fileids=None, c5=False, strip_space=True, stem=False):
        """
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))

        :param c5: If true, then the tags used will be the more detailed
            c5 tags.  Otherwise, the simplified tags will be used.
        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        """
        tag = 'c5' if c5 else 'pos'
        return self._views(
            fileids, sent=False, tag=tag, strip_space=strip_space, stem=stem
        )

    def sents(self, fileids=None, strip_space=True, stem=False):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))

        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        """
        return self._views(
            fileids, sent=True, tag=None, strip_space=strip_space, stem=stem
        )

    def tagged_sents(self, fileids=None, c5=False, strip_space=True, stem=False):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.
        :rtype: list(list(tuple(str,str)))

        :param c5: If true, then the tags used will be the more detailed
            c5 tags.  Otherwise, the simplified tags will be used.
        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        """
        tag = 'c5' if c5 else 'pos'
        return self._views(
            fileids, sent=True, tag=tag, strip_space=strip_space, stem=stem
        )

    def _views(self, fileids=None, sent=False, tag=None, strip_space=True, stem=False):
        """
        A helper function that instantiates BNCWordViews or the list of
        words/sentences.

        :param fileids: The files to read, resolved via ``abspaths()``.
        :param sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use (``'c5'`` or ``'pos'``),
            or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        :return: the per-file results combined with ``concat``.
        """
        # Lazy readers hand back one view per file; eager readers parse
        # each file completely via ``_words``.
        f = BNCWordView if self._lazy else self._words
        return concat(
            [
                f(fileid, sent, tag, strip_space, stem)
                for fileid in self.abspaths(fileids)
            ]
        )

    def _words(self, fileid, bracket_sent, tag, strip_space, stem):
        """
        Helper used to implement the view methods -- returns a list of
        words or a list of sentences, optionally tagged.

        :param fileid: The name of the underlying file.
        :param bracket_sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use, or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        """
        result = []

        xmldoc = ElementTree.parse(fileid).getroot()
        for xmlsent in xmldoc.findall('.//s'):
            sent = []
            for xmlword in _all_xmlwords_in(xmlsent):
                # An element without text has ``text`` None (or '');
                # normalise to the empty string.  fixes issue 337?
                word = xmlword.text or ""
                if strip_space or stem:
                    word = word.strip()
                if stem:
                    # Fall back to the surface form when no 'hw' is given.
                    word = xmlword.get('hw', word)
                if tag == 'c5':
                    word = (word, xmlword.get('c5'))
                elif tag == 'pos':
                    # Use the c5 tag when the element has no 'pos'.
                    word = (word, xmlword.get('pos', xmlword.get('c5')))
                sent.append(word)
            if bracket_sent:
                result.append(BNCSentence(xmlsent.attrib['n'], sent))
            else:
                result.extend(sent)

        # Sanity check: every entry must be a token or a sentence.
        assert None not in result
        return result
def _all_xmlwords_in(elt, result=None):
    """
    Collect every ``c`` and ``w`` element found below ``elt``, in
    document order.  Matching elements are not searched any further;
    every other child is searched recursively.

    :param elt: The XML element whose descendants are searched.
    :param result: An optional list to append the matches to; a new
        list is created when it is None.
    :return: the list holding the matches (``result`` itself if given).
    """
    words = [] if result is None else result
    for node in elt:
        if node.tag not in ('c', 'w'):
            # Not a token itself: look for tokens nested inside it.
            _all_xmlwords_in(node, words)
        else:
            words.append(node)
    return words
class BNCSentence(list):
    """
    A sentence represented as a plain list of its words, which also
    carries the sentence identifier (the value of the ``n`` attribute
    in the XML) in the attribute ``num``.
    """

    def __init__(self, num, items):
        # Fill the underlying list first, then attach the identifier.
        super(BNCSentence, self).__init__(items)
        self.num = num
class BNCWordView(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with the BNC corpus.
    """

    tags_to_ignore = {
        'pb', 'gap', 'vocal', 'event', 'unclear', 'shift', 'pause', 'align'
    }
    """These tags are ignored. For their description refer to the
    technical documentation, for example,
    http://www.natcorp.ox.ac.uk/docs/URG/ref-vocal.html

    """

    def __init__(self, fileid, sent, tag, strip_space, stem):
        """
        :param fileid: The name of the underlying file.
        :param sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use, or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        """
        # Sentence views match whole <s> elements; word views match the
        # <c>/<w> elements found anywhere below an <s>.
        if sent:
            tagspec = '.*/s'
        else:
            tagspec = '.*/s/(.*/)?(c|w)'
        self._sent = sent
        self._tag = tag
        self._strip_space = strip_space
        self._stem = stem

        self.title = None  #: Title of the document.
        self.author = None  #: Author of the document.
        self.editor = None  #: Editor
        self.resps = None  #: Statement of responsibility

        XMLCorpusView.__init__(self, fileid, tagspec)

        # Read in a tasty header.  The stream is closed even when the
        # header cannot be parsed, so a failure does not leak it.
        self._open()
        try:
            self.read_block(self._stream, '.*/teiHeader$', self.handle_header)
        finally:
            self.close()

        # Reset tag context.
        self._tag_context = {0: ()}

    @staticmethod
    def _stripped_text(elt):
        """Return the stripped text of ``elt``, or '' if it has no text."""
        # ``text`` is None for an empty element or one that starts with
        # a child element.
        return (elt.text or '').strip()

    def handle_header(self, elt, context):
        """
        Fill ``title``, ``author``, ``editor`` and ``resps`` from the
        ``titleStmt`` children of the header element ``elt``.  Each
        attribute is left untouched when no matching element exists.

        :param elt: The header element.
        :param context: Unused.
        """
        text_of = self._stripped_text

        # Set up some metadata!
        titles = elt.findall('titleStmt/title')
        if titles:
            self.title = '\n'.join(text_of(title) for title in titles)

        authors = elt.findall('titleStmt/author')
        if authors:
            self.author = '\n'.join(text_of(author) for author in authors)

        editors = elt.findall('titleStmt/editor')
        if editors:
            self.editor = '\n'.join(text_of(editor) for editor in editors)

        resps = elt.findall('titleStmt/respStmt')
        if resps:
            # One line per child of each statement; statements are
            # separated by a blank line.
            self.resps = '\n\n'.join(
                '\n'.join(text_of(resp_elt) for resp_elt in resp) for resp in resps
            )

    def handle_elt(self, elt, context):
        """
        Convert a matched element to a sentence or a word, depending on
        whether this view was created with sentence bracketing.

        :param elt: The matched element.
        :param context: Unused.
        """
        if self._sent:
            return self.handle_sent(elt)
        else:
            return self.handle_word(elt)

    def handle_word(self, elt):
        """
        Build the token for a single word element.

        :param elt: The word element.
        :return: the word string, or a ``(word, tag)`` tuple when this
            view was created with the ``'c5'`` or ``'pos'`` tagset.
        """
        word = elt.text
        if not word:
            word = ""  # fixes issue 337?
        if self._strip_space or self._stem:
            word = word.strip()
        if self._stem:
            # Fall back to the surface form when no 'hw' is given.
            word = elt.get('hw', word)
        if self._tag == 'c5':
            word = (word, elt.get('c5'))
        elif self._tag == 'pos':
            # Use the c5 tag when the element has no 'pos'.
            word = (word, elt.get('pos', elt.get('c5')))
        return word

    def handle_sent(self, elt):
        """
        Build a ``BNCSentence`` from a sentence element.

        :param elt: The sentence element; its ``n`` attribute becomes
            the sentence identifier.
        :raise ValueError: if a direct child is neither a word, a known
            container, nor listed in ``tags_to_ignore``.
        """
        # NOTE(review): containers are unpacked one level only, and each
        # of their children is treated as a word.  The word-level tagspec
        # matches c/w at any depth below <s> -- confirm whether deeper
        # nesting occurs in the corpus.
        sent = []
        for child in elt:
            if child.tag in ('mw', 'hi', 'corr', 'trunc'):
                sent += [self.handle_word(w) for w in child]
            elif child.tag in ('w', 'c'):
                sent.append(self.handle_word(child))
            elif child.tag not in self.tags_to_ignore:
                raise ValueError('Unexpected element %s' % child.tag)
        return BNCSentence(elt.attrib['n'], sent)