You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
169 lines
4.7 KiB
Python
169 lines
4.7 KiB
Python
#
|
|
# Copyright (C) 2001-2020 NLTK Project
|
|
# Author: Masato Hagiwara <hagisan@gmail.com>
|
|
# URL: <http://nltk.org/>
|
|
# For license information, see LICENSE.TXT
|
|
|
|
# For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html
|
|
|
|
import sys
|
|
|
|
from nltk.corpus.reader import util
|
|
|
|
from nltk.corpus.reader.util import *
|
|
from nltk.corpus.reader.api import *
|
|
|
|
|
|
class ChasenCorpusReader(CorpusReader):
|
|
def __init__(self, root, fileids, encoding="utf8", sent_splitter=None):
|
|
self._sent_splitter = sent_splitter
|
|
CorpusReader.__init__(self, root, fileids, encoding)
|
|
|
|
def raw(self, fileids=None):
|
|
if fileids is None:
|
|
fileids = self._fileids
|
|
elif isinstance(fileids, str):
|
|
fileids = [fileids]
|
|
return concat([self.open(f).read() for f in fileids])
|
|
|
|
def words(self, fileids=None):
|
|
return concat(
|
|
[
|
|
ChasenCorpusView(fileid, enc, False, False, False, self._sent_splitter)
|
|
for (fileid, enc) in self.abspaths(fileids, True)
|
|
]
|
|
)
|
|
|
|
def tagged_words(self, fileids=None):
|
|
return concat(
|
|
[
|
|
ChasenCorpusView(fileid, enc, True, False, False, self._sent_splitter)
|
|
for (fileid, enc) in self.abspaths(fileids, True)
|
|
]
|
|
)
|
|
|
|
def sents(self, fileids=None):
|
|
return concat(
|
|
[
|
|
ChasenCorpusView(fileid, enc, False, True, False, self._sent_splitter)
|
|
for (fileid, enc) in self.abspaths(fileids, True)
|
|
]
|
|
)
|
|
|
|
def tagged_sents(self, fileids=None):
|
|
return concat(
|
|
[
|
|
ChasenCorpusView(fileid, enc, True, True, False, self._sent_splitter)
|
|
for (fileid, enc) in self.abspaths(fileids, True)
|
|
]
|
|
)
|
|
|
|
def paras(self, fileids=None):
|
|
return concat(
|
|
[
|
|
ChasenCorpusView(fileid, enc, False, True, True, self._sent_splitter)
|
|
for (fileid, enc) in self.abspaths(fileids, True)
|
|
]
|
|
)
|
|
|
|
def tagged_paras(self, fileids=None):
|
|
return concat(
|
|
[
|
|
ChasenCorpusView(fileid, enc, True, True, True, self._sent_splitter)
|
|
for (fileid, enc) in self.abspaths(fileids, True)
|
|
]
|
|
)
|
|
|
|
|
|
class ChasenCorpusView(StreamBackedCorpusView):
|
|
"""
|
|
A specialized corpus view for ChasenReader. Similar to ``TaggedCorpusView``,
|
|
but this'll use fixed sets of word and sentence tokenizer.
|
|
"""
|
|
|
|
def __init__(
|
|
self,
|
|
corpus_file,
|
|
encoding,
|
|
tagged,
|
|
group_by_sent,
|
|
group_by_para,
|
|
sent_splitter=None,
|
|
):
|
|
self._tagged = tagged
|
|
self._group_by_sent = group_by_sent
|
|
self._group_by_para = group_by_para
|
|
self._sent_splitter = sent_splitter
|
|
StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)
|
|
|
|
def read_block(self, stream):
|
|
"""Reads one paragraph at a time."""
|
|
block = []
|
|
for para_str in read_regexp_block(stream, r".", r"^EOS\n"):
|
|
|
|
para = []
|
|
|
|
sent = []
|
|
for line in para_str.splitlines():
|
|
|
|
_eos = line.strip() == "EOS"
|
|
_cells = line.split("\t")
|
|
w = (_cells[0], "\t".join(_cells[1:]))
|
|
if not _eos:
|
|
sent.append(w)
|
|
|
|
if _eos or (self._sent_splitter and self._sent_splitter(w)):
|
|
if not self._tagged:
|
|
sent = [w for (w, t) in sent]
|
|
if self._group_by_sent:
|
|
para.append(sent)
|
|
else:
|
|
para.extend(sent)
|
|
sent = []
|
|
|
|
if len(sent) > 0:
|
|
if not self._tagged:
|
|
sent = [w for (w, t) in sent]
|
|
|
|
if self._group_by_sent:
|
|
para.append(sent)
|
|
else:
|
|
para.extend(sent)
|
|
|
|
if self._group_by_para:
|
|
block.append(para)
|
|
else:
|
|
block.extend(para)
|
|
|
|
return block
|
|
|
|
|
|
def demo():
|
|
|
|
import nltk
|
|
from nltk.corpus.util import LazyCorpusLoader
|
|
|
|
jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8")
|
|
print("/".join(jeita.words()[22100:22140]))
|
|
|
|
print(
|
|
"\nEOS\n".join(
|
|
"\n".join("%s/%s" % (w[0], w[1].split("\t")[2]) for w in sent)
|
|
for sent in jeita.tagged_sents()[2170:2173]
|
|
)
|
|
)
|
|
|
|
|
|
def test():
|
|
|
|
from nltk.corpus.util import LazyCorpusLoader
|
|
|
|
jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8")
|
|
|
|
assert isinstance(jeita.tagged_words()[0][1], str)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
demo()
|
|
test()
|