You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
382 lines
12 KiB
Python
382 lines
12 KiB
Python
# Natural Language Toolkit: PL196x Corpus Reader
|
|
#
|
|
# Copyright (C) 2001-2020 NLTK Project
|
|
# Author: Piotr Kasprzyk <p.j.kasprzyk@gmail.com>
|
|
# URL: <http://nltk.org/>
|
|
# For license information, see LICENSE.TXT
|
|
|
|
from nltk.corpus.reader.api import *
|
|
from nltk.corpus.reader.xmldocs import XMLCorpusReader
|
|
|
|
|
|
# --- Structural elements -------------------------------------------------
# Each pattern matches an opening tag (with or without attributes) and
# captures, non-greedily, everything up to the first matching closing tag.
PARA = re.compile(r"<p(?: [^>]*){0,1}>(.*?)</p>")
SENT = re.compile(r"<s(?: [^>]*){0,1}>(.*?)</s>")

# --- Tokens ---------------------------------------------------------------
# Tokens are <w> or <c> elements.  TAGGEDWORD captures two groups: the
# remainder of the opening tag (element letter, attributes and the closing
# ">") and the element content.  WORD captures the element content only.
TAGGEDWORD = re.compile(r"<([wc](?: [^>]*){0,1}>)(.*?)</[wc]>")
WORD = re.compile(r"<[wc](?: [^>]*){0,1}>(.*?)</[wc]>")

# --- Attribute values -----------------------------------------------------
# Applied to the opening-tag text captured by TAGGEDWORD: ``type`` is read
# for <c> elements and ``ana`` for <w> elements.
TYPE = re.compile(r'type="(.*?)"')
ANA = re.compile(r'ana="(.*?)"')

# Identifier carried by a ``text id="..."`` attribute.
TEXTID = re.compile(r'text id="(.*?)"')
|
|
|
|
|
|
class TEICorpusView(StreamBackedCorpusView):
    """
    Stream-backed view that extracts words from ``<p>``/``<s>``/``<w>``/``<c>``
    markup, optionally restricted to a chosen set of ``<text id="...">``
    elements.

    :param corpus_file: file handed to ``StreamBackedCorpusView``.
    :param tagged: if true, tokens are ``(word, tag)`` tuples; otherwise
        plain word strings.
    :param group_by_sent: if true, the tokens of each sentence are kept in
        their own list.
    :param group_by_para: if true, the contents of each paragraph are kept
        in their own list.
    :param tagset: accepted for signature compatibility; not used here.
    :param head_len: stream position at which reading starts (passed to the
        base class as ``startpos``), used to skip the file header.
    :param textids: container of text ids to keep; when empty or ``None``
        every text is kept.
    """

    # Size hint handed to ``stream.readlines()`` for every block.
    _pagesize = 4096

    def __init__(
        self,
        corpus_file,
        tagged,
        group_by_sent,
        group_by_para,
        tagset=None,
        head_len=0,
        textids=None,
    ):
        self._tagged = tagged
        self._textids = textids

        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        # WARNING -- skip header
        StreamBackedCorpusView.__init__(self, corpus_file, startpos=head_len)

    def read_block(self, stream):
        """
        Read one block from ``stream`` and return its tokens.

        Lines are read until at least one ``<text id`` element has been
        seen and every opened one has been closed by ``</text>``, or until
        the stream is exhausted.  Texts whose id is not in the requested
        set are cut out before the paragraphs are parsed.

        :return: a list of words, sentences or paragraphs, depending on the
            ``group_by_*`` flags given to the constructor.
        """
        open_tag, close_tag = "<text id", "</text>"

        # Collect the pieces in a list and keep running tag counts instead
        # of re-scanning (and re-copying) the whole block for every extra
        # line.  Neither tag contains a line break, so counting per line is
        # equivalent to counting on the concatenated block.
        pieces = list(stream.readlines(self._pagesize))
        opened = sum(piece.count(open_tag) for piece in pieces)
        closed = sum(piece.count(close_tag) for piece in pieces)
        while opened > closed or opened == 0:
            line = stream.readline()
            if not line:
                break  # end of stream
            pieces.append(line)
            opened += line.count(open_tag)
            closed += line.count(close_tag)

        block = "".join(pieces).replace("\n", "")

        if self._textids:
            for tid in TEXTID.findall(block):
                if tid in self._textids:
                    continue
                # Anchor on the whole attribute so that an id which happens
                # to be a substring of another id (or of ordinary content)
                # cannot make us delete the wrong region.
                beg = block.find('text id="%s"' % tid)
                if beg < 0:
                    continue  # already removed together with an earlier text
                end = block.find(close_tag, beg)
                # An unterminated text (truncated input) is dropped up to
                # the end of the block.
                end = len(block) if end < 0 else end + len(close_tag)
                block = block[:beg] + block[end:]

        output = []
        for para_str in PARA.findall(block):
            para = []
            for sent_str in SENT.findall(para_str):
                if self._tagged:
                    sent = [
                        self._parse_tag(match)
                        for match in TAGGEDWORD.findall(sent_str)
                    ]
                else:
                    sent = WORD.findall(sent_str)
                if self._group_by_sent:
                    para.append(sent)
                else:
                    para.extend(sent)
            if self._group_by_para:
                output.append(para)
            else:
                output.extend(para)
        return output

    def _parse_tag(self, tag_word_tuple):
        """
        Turn a ``TAGGEDWORD`` match into a ``(word, tag)`` pair.

        :param tag_word_tuple: ``(opening_tag_text, word)`` as captured by
            ``TAGGEDWORD``.
        :return: ``(word, tag)`` where the tag is the ``ana`` attribute of a
            ``<w>`` element or the ``type`` attribute of a ``<c>`` element.
        """
        (tag, word) = tag_word_tuple
        if tag.startswith("w"):
            tag = ANA.search(tag).group(1)
        else:  # tag.startswith('c')
            tag = TYPE.search(tag).group(1)
        return word, tag
|
|
|
|
|
|
class Pl196xCorpusReader(CategorizedCorpusReader, XMLCorpusReader):
    """
    Categorized XML corpus reader whose content can additionally be
    selected by *text id*, a unit finer than a file.

    Besides the arguments consumed by ``XMLCorpusReader`` (positional) and
    ``CategorizedCorpusReader`` (keyword), the constructor accepts:

    :param textid_file: optional path of a mapping file.  Each line holds a
        file id, a single space, and the text ids contained in that file,
        separated by the category delimiter.
    """

    # Passed as ``head_len`` to every TEICorpusView, i.e. the stream
    # position at which reading of a corpus file starts.
    head_len = 2770

    def __init__(self, *args, **kwargs):
        self._textids = kwargs.get("textid_file")

        XMLCorpusReader.__init__(self, *args)
        CategorizedCorpusReader.__init__(self, kwargs)

        self._init_textids()

    def _init_textids(self):
        """
        Build the file-id -> text-ids and text-id -> file-ids tables from
        the mapping file, if one was given.

        :raises ValueError: if the mapping names a file id that is not part
            of this corpus.
        """
        self._f2t = defaultdict(list)
        self._t2f = defaultdict(list)
        if self._textids is None:
            return

        # Evaluate the file list once instead of once per mapping line.
        known_files = set(self.fileids())
        with open(self._textids) as fp:
            for line in fp:
                line = line.strip()
                if not line:
                    continue  # tolerate blank lines in the mapping file
                file_id, text_ids = line.split(" ", 1)
                if file_id not in known_files:
                    raise ValueError(
                        "In text_id mapping file %s: %s not found"
                        % (self._textids, file_id)
                    )
                for text_id in text_ids.split(self._delimiter):
                    self._add_textids(file_id, text_id)

    def _add_textids(self, file_id, text_id):
        """Record that ``text_id`` is contained in ``file_id``."""
        self._f2t[file_id].append(text_id)
        self._t2f[text_id].append(file_id)

    def _resolve(self, fileids, categories, textids=None):
        """
        Translate the three mutually exclusive selectors into file ids.

        :return: a pair ``(fileids, textids_by_file)``.  ``fileids`` is
            ``None`` when no selector was given (callers treat that as
            "everything").  ``textids_by_file`` is ``None`` unless
            ``textids`` was given, in which case it maps every returned
            file id to the set of requested text ids found in that file.
        :raises ValueError: if more than one selector is given.
        """
        given = sum(
            selector is not None for selector in (fileids, categories, textids)
        )
        if given > 1:
            raise ValueError(
                "Specify at most one of: fileids, categories or textids"
            )

        if fileids is not None:
            return fileids, None

        if categories is not None:
            return self.fileids(categories), None

        if textids is not None:
            if isinstance(textids, str):
                textids = [textids]
            wanted = set(textids)
            files = []
            by_file = {}
            for text_id in textids:
                # ``.get`` keeps unknown ids from being inserted into the
                # defaultdict (they would otherwise show up in textids()).
                for file_id in self._t2f.get(text_id, ()):
                    if file_id in by_file:
                        continue  # list each file once, in first-seen order
                    files.append(file_id)
                    by_file[file_id] = set(self._f2t.get(file_id, ())) & wanted
            return files, by_file

        return None, None

    def decode_tag(self, tag):
        """Return ``tag`` unchanged (decoding is not implemented yet)."""
        # to be implemented
        return tag

    def textids(self, fileids=None, categories=None):
        """
        Return the sorted text ids of the selected files, or of the whole
        corpus when neither ``fileids`` nor ``categories`` is given.

        In the pl196x corpus each category is stored in a single file, so
        selecting by file and selecting by category are equivalent.  To
        allow finer granularity, this non-standard ``textids()`` method is
        provided: all the main accessors accept a list of text ids, giving
        the user much more control.
        """
        fileids, _ = self._resolve(fileids, categories)
        if fileids is None:
            return sorted(self._t2f)

        if isinstance(fileids, str):
            fileids = [fileids]
        return sorted(
            text_id
            for file_id in fileids
            for text_id in self._f2t.get(file_id, ())
        )

    def _views(
        self, fileids, categories, textids, tagged, group_by_sent, group_by_para
    ):
        """
        Shared implementation of the word/sentence/paragraph accessors:
        resolve the selectors and concatenate one TEICorpusView per file.
        """
        fileids, textids = self._resolve(fileids, categories, textids)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]

        return concat(
            [
                TEICorpusView(
                    self.abspath(fileid),
                    tagged,
                    group_by_sent,
                    group_by_para,
                    head_len=self.head_len,
                    # Restrict each view to the text ids requested for its
                    # own file; no restriction when selecting by file or
                    # category.
                    textids=textids[fileid] if textids else None,
                )
                for fileid in fileids
            ]
        )

    def words(self, fileids=None, categories=None, textids=None):
        """Return the selected content as a flat sequence of words."""
        return self._views(fileids, categories, textids, False, False, False)

    def sents(self, fileids=None, categories=None, textids=None):
        """Return the selected content as sentences (lists of words)."""
        return self._views(fileids, categories, textids, False, True, False)

    def paras(self, fileids=None, categories=None, textids=None):
        """Return the selected content as paragraphs (lists of sentences)."""
        return self._views(fileids, categories, textids, False, True, True)

    def tagged_words(self, fileids=None, categories=None, textids=None):
        """Return the selected content as a flat sequence of (word, tag)."""
        return self._views(fileids, categories, textids, True, False, False)

    def tagged_sents(self, fileids=None, categories=None, textids=None):
        """Return the selected content as sentences of (word, tag) pairs."""
        return self._views(fileids, categories, textids, True, True, False)

    def tagged_paras(self, fileids=None, categories=None, textids=None):
        """Return the selected content as paragraphs of tagged sentences."""
        return self._views(fileids, categories, textids, True, True, True)

    def xml(self, fileids=None, categories=None):
        """
        Return the XML of the single selected file.

        :raises TypeError: unless the selection is exactly one file.
        """
        fileids, _ = self._resolve(fileids, categories)
        if isinstance(fileids, str):
            # A bare string is one file id, not a sequence of characters.
            fileids = [fileids]
        if fileids is None or len(fileids) != 1:
            raise TypeError("Expected a single file")
        return XMLCorpusReader.xml(self, fileids[0])

    def raw(self, fileids=None, categories=None):
        """Return the concatenated raw contents of the selected files."""
        fileids, _ = self._resolve(fileids, categories)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]

        contents = []
        for fileid in fileids:
            # Close every stream as soon as it has been read.
            with self.open(fileid) as stream:
                contents.append(stream.read())
        return concat(contents)
|