# Natural Language Toolkit: Corpus Readers
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
# TODO this docstring isn't up-to-date!
"""
NLTK corpus readers. The modules in this package provide functions
that can be used to read corpus files in a variety of formats. These
functions can be used to read both the corpus files that are
distributed in the NLTK corpus package, and corpus files that are part
of external corpora.
Available Corpora
=================
Please see http://www.nltk.org/nltk_data/ for a complete list.
Install corpora using nltk.download().
Corpus Reader Functions
=======================
Each corpus module defines one or more "corpus reader functions",
which can be used to read documents from that corpus. These functions
take an argument, ``item``, which is used to indicate which document
should be read from the corpus:
- If ``item`` is one of the unique identifiers listed in the corpus
module's ``items`` variable, then the corresponding document will
be loaded from the NLTK corpus package.
- If ``item`` is a filename, then that file will be read.
Additionally, corpus reader functions can be given lists of item
names; in which case, they will return a concatenation of the
corresponding documents.
Corpus reader functions are named based on the type of information
they return. Some common examples, and their return types, are:
- words(): list of str
- sents(): list of (list of str)
- paras(): list of (list of (list of str))
- tagged_words(): list of (str,str) tuple
- tagged_sents(): list of (list of (str,str))
- tagged_paras(): list of (list of (list of (str,str)))
- chunked_sents(): list of (Tree w/ (str,str) leaves)
- parsed_sents(): list of (Tree with str leaves)
- parsed_paras(): list of (list of (Tree with str leaves))
- xml(): A single xml ElementTree
- raw(): unprocessed corpus contents
For example, to read a list of the words in the Brown Corpus, use
``nltk.corpus.brown.words()``:
>>> from nltk.corpus import brown
>>> print(", ".join(brown.words()))
The, Fulton, County, Grand, Jury, said, ...
"""
import re
from nltk.tokenize import RegexpTokenizer
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader import *
# Lazily-loaded corpus readers: each loader defers opening the corpus files
# until the corpus is first accessed (see LazyCorpusLoader).
abc = LazyCorpusLoader(
    "abc",
    PlaintextCorpusReader,
    r"(?!\.).*\.txt",
    encoding=[("science", "latin_1"), ("rural", "utf8")],
)
alpino = LazyCorpusLoader("alpino", AlpinoCorpusReader, tagset="alpino")
brown = LazyCorpusLoader(
    "brown",
    CategorizedTaggedCorpusReader,
    r"c[a-z]\d\d",
    cat_file="cats.txt",
    tagset="brown",
    encoding="ascii",
)
cess_cat = LazyCorpusLoader(
    "cess_cat",
    BracketParseCorpusReader,
    r"(?!\.).*\.tbf",
    tagset="unknown",
    encoding="ISO-8859-15",
)
cess_esp = LazyCorpusLoader(
    "cess_esp",
    BracketParseCorpusReader,
    r"(?!\.).*\.tbf",
    tagset="unknown",
    encoding="ISO-8859-15",
)
cmudict = LazyCorpusLoader("cmudict", CMUDictCorpusReader, ["cmudict"])
comtrans = LazyCorpusLoader("comtrans", AlignedCorpusReader, r"(?!\.).*\.txt")
comparative_sentences = LazyCorpusLoader(
    "comparative_sentences",
    ComparativeSentencesCorpusReader,
    r"labeledSentences\.txt",
    encoding="latin-1",
)
conll2000 = LazyCorpusLoader(
    "conll2000",
    ConllChunkCorpusReader,
    ["train.txt", "test.txt"],
    ("NP", "VP", "PP"),
    tagset="wsj",
    encoding="ascii",
)
conll2002 = LazyCorpusLoader(
    "conll2002",
    ConllChunkCorpusReader,
    # raw string: "\." in a plain str literal is an invalid escape sequence
    r".*\.(test|train).*",
    ("LOC", "PER", "ORG", "MISC"),
    encoding="utf-8",
)
conll2007 = LazyCorpusLoader(
    "conll2007",
    DependencyCorpusReader,
    r".*\.(test|train).*",
    encoding=[("eus", "ISO-8859-2"), ("esp", "utf8")],
)
crubadan = LazyCorpusLoader("crubadan", CrubadanCorpusReader, r".*\.txt")
dependency_treebank = LazyCorpusLoader(
    "dependency_treebank", DependencyCorpusReader, r".*\.dp", encoding="ascii"
)
floresta = LazyCorpusLoader(
    "floresta",
    BracketParseCorpusReader,
    r"(?!\.).*\.ptb",
    "#",
    tagset="unknown",
    encoding="ISO-8859-15",
)
framenet15 = LazyCorpusLoader(
    "framenet_v15",
    FramenetCorpusReader,
    [
        "frRelation.xml",
        "frameIndex.xml",
        "fulltextIndex.xml",
        "luIndex.xml",
        "semTypes.xml",
    ],
)
framenet = LazyCorpusLoader(
    "framenet_v17",
    FramenetCorpusReader,
    [
        "frRelation.xml",
        "frameIndex.xml",
        "fulltextIndex.xml",
        "luIndex.xml",
        "semTypes.xml",
    ],
)
gazetteers = LazyCorpusLoader(
    "gazetteers", WordListCorpusReader, r"(?!LICENSE|\.).*\.txt", encoding="ISO-8859-2"
)
genesis = LazyCorpusLoader(
    "genesis",
    PlaintextCorpusReader,
    r"(?!\.).*\.txt",
    # per-fileid encodings: first matching regex wins, ".*" is the fallback
    encoding=[
        ("finnish|french|german", "latin_1"),
        ("swedish", "cp865"),
        (".*", "utf_8"),
    ],
)
gutenberg = LazyCorpusLoader(
    "gutenberg", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin1"
)
ieer = LazyCorpusLoader("ieer", IEERCorpusReader, r"(?!README|\.).*")
inaugural = LazyCorpusLoader(
    "inaugural", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin1"
)
# [XX] This should probably just use TaggedCorpusReader:
indian = LazyCorpusLoader(
    "indian", IndianCorpusReader, r"(?!\.).*\.pos", tagset="unknown", encoding="utf8"
)
jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*\.chasen", encoding="utf-8")
knbc = LazyCorpusLoader("knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp")
lin_thesaurus = LazyCorpusLoader("lin_thesaurus", LinThesaurusCorpusReader, r".*\.lsp")
mac_morpho = LazyCorpusLoader(
    "mac_morpho",
    MacMorphoCorpusReader,
    r"(?!\.).*\.txt",
    tagset="unknown",
    encoding="latin-1",
)
machado = LazyCorpusLoader(
    "machado",
    PortugueseCategorizedPlaintextCorpusReader,
    r"(?!\.).*\.txt",
    cat_pattern=r"([a-z]*)/.*",
    encoding="latin-1",
)
masc_tagged = LazyCorpusLoader(
    "masc_tagged",
    CategorizedTaggedCorpusReader,
    r"(spoken|written)/.*\.txt",
    cat_file="categories.txt",
    tagset="wsj",
    encoding="utf-8",
    sep="_",
)
movie_reviews = LazyCorpusLoader(
    "movie_reviews",
    CategorizedPlaintextCorpusReader,
    r"(?!\.).*\.txt",
    cat_pattern=r"(neg|pos)/.*",
    encoding="ascii",
)
multext_east = LazyCorpusLoader(
    "mte_teip5", MTECorpusReader, r"(oana).*\.xml", encoding="utf-8"
)
names = LazyCorpusLoader(
    "names", WordListCorpusReader, r"(?!\.).*\.txt", encoding="ascii"
)
nps_chat = LazyCorpusLoader(
    "nps_chat", NPSChatCorpusReader, r"(?!README|\.).*\.xml", tagset="wsj"
)
opinion_lexicon = LazyCorpusLoader(
    "opinion_lexicon",
    OpinionLexiconCorpusReader,
    r"(\w+)\-words\.txt",
    encoding="ISO-8859-2",
)
ppattach = LazyCorpusLoader(
    "ppattach", PPAttachmentCorpusReader, ["training", "test", "devset"]
)
product_reviews_1 = LazyCorpusLoader(
    "product_reviews_1", ReviewsCorpusReader, r"^(?!Readme).*\.txt", encoding="utf8"
)
product_reviews_2 = LazyCorpusLoader(
    "product_reviews_2", ReviewsCorpusReader, r"^(?!Readme).*\.txt", encoding="utf8"
)
pros_cons = LazyCorpusLoader(
    "pros_cons",
    ProsConsCorpusReader,
    r"Integrated(Cons|Pros)\.txt",
    cat_pattern=r"Integrated(Cons|Pros)\.txt",
    encoding="ISO-8859-2",
)
ptb = LazyCorpusLoader(  # Penn Treebank v3: WSJ and Brown portions
    "ptb",
    CategorizedBracketParseCorpusReader,
    r"(WSJ/\d\d/WSJ_\d\d|BROWN/C[A-Z]/C[A-Z])\d\d.MRG",
    cat_file="allcats.txt",
    tagset="wsj",
)
qc = LazyCorpusLoader(
    "qc", StringCategoryCorpusReader, ["train.txt", "test.txt"], encoding="ISO-8859-2"
)
reuters = LazyCorpusLoader(
    "reuters",
    CategorizedPlaintextCorpusReader,
    "(training|test).*",
    cat_file="cats.txt",
    encoding="ISO-8859-2",
)
rte = LazyCorpusLoader("rte", RTECorpusReader, r"(?!\.).*\.xml")
senseval = LazyCorpusLoader("senseval", SensevalCorpusReader, r"(?!\.).*\.pos")
sentence_polarity = LazyCorpusLoader(
    "sentence_polarity",
    CategorizedSentencesCorpusReader,
    r"rt-polarity\.(neg|pos)",
    cat_pattern=r"rt-polarity\.(neg|pos)",
    encoding="utf-8",
)
sentiwordnet = LazyCorpusLoader(
    "sentiwordnet", SentiWordNetCorpusReader, "SentiWordNet_3.0.0.txt", encoding="utf-8"
)
shakespeare = LazyCorpusLoader("shakespeare", XMLCorpusReader, r"(?!\.).*\.xml")
sinica_treebank = LazyCorpusLoader(
    "sinica_treebank",
    SinicaTreebankCorpusReader,
    ["parsed"],
    tagset="unknown",
    encoding="utf-8",
)
state_union = LazyCorpusLoader(
    "state_union", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="ISO-8859-2"
)
stopwords = LazyCorpusLoader(
    "stopwords", WordListCorpusReader, r"(?!README|\.).*", encoding="utf8"
)
subjectivity = LazyCorpusLoader(
    "subjectivity",
    CategorizedSentencesCorpusReader,
    r"(quote.tok.gt9|plot.tok.gt9)\.5000",
    cat_map={"quote.tok.gt9.5000": ["subj"], "plot.tok.gt9.5000": ["obj"]},
    encoding="latin-1",
)
swadesh = LazyCorpusLoader(
    "swadesh", SwadeshCorpusReader, r"(?!README|\.).*", encoding="utf8"
)
swadesh110 = LazyCorpusLoader(
    "panlex_swadesh", PanlexSwadeshCorpusReader, r"swadesh110/.*\.txt", encoding="utf8"
)
swadesh207 = LazyCorpusLoader(
    "panlex_swadesh", PanlexSwadeshCorpusReader, r"swadesh207/.*\.txt", encoding="utf8"
)
switchboard = LazyCorpusLoader("switchboard", SwitchboardCorpusReader, tagset="wsj")
timit = LazyCorpusLoader("timit", TimitCorpusReader)
timit_tagged = LazyCorpusLoader(
    "timit", TimitTaggedCorpusReader, r".+\.tags", tagset="wsj", encoding="ascii"
)
toolbox = LazyCorpusLoader(
    "toolbox", ToolboxCorpusReader, r"(?!.*(README|\.)).*\.(dic|txt)"
)
treebank = LazyCorpusLoader(
    "treebank/combined",
    BracketParseCorpusReader,
    r"wsj_.*\.mrg",
    tagset="wsj",
    encoding="ascii",
)
treebank_chunk = LazyCorpusLoader(
    "treebank/tagged",
    ChunkedCorpusReader,
    r"wsj_.*\.pos",
    # split sentences on whitespace following a "/." tag, unless inside brackets
    sent_tokenizer=RegexpTokenizer(r"(?<=/\.)\s*(?![^\[]*\])", gaps=True),
    para_block_reader=tagged_treebank_para_block_reader,
    tagset="wsj",
    encoding="ascii",
)
treebank_raw = LazyCorpusLoader(
    "treebank/raw", PlaintextCorpusReader, r"wsj_.*", encoding="ISO-8859-2"
)
twitter_samples = LazyCorpusLoader("twitter_samples", TwitterCorpusReader, r".*\.json")
udhr = LazyCorpusLoader("udhr", UdhrCorpusReader)
udhr2 = LazyCorpusLoader("udhr2", PlaintextCorpusReader, r".*\.txt", encoding="utf8")
universal_treebanks = LazyCorpusLoader(
    "universal_treebanks_v20",
    ConllCorpusReader,
    r".*\.conll",
    # only the word and POS columns of the CoNLL rows are used
    columntypes=(
        "ignore",
        "words",
        "ignore",
        "ignore",
        "pos",
        "ignore",
        "ignore",
        "ignore",
        "ignore",
        "ignore",
    ),
)
verbnet = LazyCorpusLoader("verbnet", VerbnetCorpusReader, r"(?!\.).*\.xml")
webtext = LazyCorpusLoader(
    "webtext", PlaintextCorpusReader, r"(?!README|\.).*\.txt", encoding="ISO-8859-2"
)
wordnet = LazyCorpusLoader(
    "wordnet",
    WordNetCorpusReader,
    # the Open Multilingual Wordnet data is itself loaded lazily
    LazyCorpusLoader("omw", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
)
wordnet_ic = LazyCorpusLoader("wordnet_ic", WordNetICCorpusReader, r".*\.dat")
words = LazyCorpusLoader(
    "words", WordListCorpusReader, r"(?!README|\.).*", encoding="ascii"
)
# defined after treebank
propbank = LazyCorpusLoader(
    "propbank",
    PropbankCorpusReader,
    "prop.txt",
    r"frames/.*\.xml",
    "verbs.txt",
    lambda filename: re.sub(r"^wsj/\d\d/", "", filename),
    treebank,
)  # Must be defined *after* treebank corpus.
nombank = LazyCorpusLoader(
    "nombank.1.0",
    NombankCorpusReader,
    "nombank.1.0",
    r"frames/.*\.xml",
    "nombank.1.0.words",
    lambda filename: re.sub(r"^wsj/\d\d/", "", filename),
    treebank,
)  # Must be defined *after* treebank corpus.
propbank_ptb = LazyCorpusLoader(
    "propbank",
    PropbankCorpusReader,
    "prop.txt",
    r"frames/.*\.xml",
    "verbs.txt",
    lambda filename: filename.upper(),
    ptb,
)  # Must be defined *after* ptb corpus.
nombank_ptb = LazyCorpusLoader(
    "nombank.1.0",
    NombankCorpusReader,
    "nombank.1.0",
    r"frames/.*\.xml",
    "nombank.1.0.words",
    lambda filename: filename.upper(),
    ptb,
)  # Must be defined *after* ptb corpus.
semcor = LazyCorpusLoader(
    "semcor", SemcorCorpusReader, r"brown./tagfiles/br-.*\.xml", wordnet
)  # Must be defined *after* wordnet corpus.
nonbreaking_prefixes = LazyCorpusLoader(
    "nonbreaking_prefixes",
    NonbreakingPrefixesCorpusReader,
    r"(?!README|\.).*",
    encoding="utf8",
)
perluniprops = LazyCorpusLoader(
    "perluniprops",
    UnicharsCorpusReader,
    r"(?!README|\.).*",
    nltk_data_subdir="misc",
    encoding="utf8",
)
# mwa_ppdb = LazyCorpusLoader(
#     'mwa_ppdb', MWAPPDBCorpusReader, r'(?!README|\.).*', nltk_data_subdir='misc', encoding='utf8')
# See https://github.com/nltk/nltk/issues/1579
# and https://github.com/nltk/nltk/issues/1716
#
# pl196x = LazyCorpusLoader(
#     'pl196x', Pl196xCorpusReader, r'[a-z]-.*\.xml',
#     cat_file='cats.txt', textid_file='textids.txt', encoding='utf8')
#
# ipipan = LazyCorpusLoader(
#     'ipipan', IPIPANCorpusReader, r'(?!\.).*morph\.xml')
#
# nkjp = LazyCorpusLoader(
#     'nkjp', NKJPCorpusReader, r'', encoding='utf8')
#
# panlex_lite = LazyCorpusLoader(
#     'panlex_lite', PanLexLiteCorpusReader)
#
# ycoe = LazyCorpusLoader(
#     'ycoe', YCOECorpusReader)
#
# corpus not available with NLTK; these lines caused help(nltk.corpus) to break
# hebrew_treebank = LazyCorpusLoader(
#     'hebrew_treebank', BracketParseCorpusReader, r'.*\.txt')
# FIXME: override any imported demo from various corpora, see https://github.com/nltk/nltk/issues/2116
def demo():
    """Run the ``demo()`` of a selection of corpora, in a fixed order.

    NOTE(review): this list is out-of-date relative to the loaders above.
    """
    demo_corpora = (
        abc,
        brown,
        # chat80,
        cmudict,
        conll2000,
        conll2002,
        genesis,
        gutenberg,
        ieer,
        inaugural,
        indian,
        names,
        ppattach,
        senseval,
        shakespeare,
        sinica_treebank,
        state_union,
        stopwords,
        timit,
        toolbox,
        treebank,
        udhr,
        webtext,
        words,
        # ycoe,
    )
    for corpus in demo_corpora:
        corpus.demo()
if __name__ == "__main__":
    # demo()
    pass
# ** this is for nose **
# unload all corpus after tests
def teardown_module(module=None):
    """Test-suite teardown hook: unload every loaded corpus reader.

    Iterates over all attributes of ``nltk.corpus`` and calls ``_unload()``
    on anything that is a ``CorpusReader``, freeing memory between test runs.
    """
    import nltk.corpus

    for name in dir(nltk.corpus):
        obj = getattr(nltk.corpus, name, None)
        # hasattr guard: not every CorpusReader subclass defines _unload
        if isinstance(obj, CorpusReader) and hasattr(obj, "_unload"):
            obj._unload()