You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

184 lines
6.2 KiB
Python

# Natural Language Toolkit: Corpus Readers
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
NLTK corpus readers. The modules in this package provide functions
that can be used to read corpus files in a variety of formats. These
functions can be used to read both the corpus files that are
distributed in the NLTK corpus package, and corpus files that are part
of external corpora.
Corpus Reader Functions
=======================
Each corpus module defines one or more "corpus reader functions",
which can be used to read documents from that corpus. These functions
take an argument, ``item``, which is used to indicate which document
should be read from the corpus:
- If ``item`` is one of the unique identifiers listed in the corpus
module's ``items`` variable, then the corresponding document will
be loaded from the NLTK corpus package.
- If ``item`` is a fileid, then that file will be read.
Additionally, corpus reader functions can be given lists of item
names, in which case they will return a concatenation of the
corresponding documents.
Corpus reader functions are named based on the type of information
they return. Some common examples, and their return types, are:
- words(): list of str
- sents(): list of (list of str)
- paras(): list of (list of (list of str))
- tagged_words(): list of (str,str) tuple
- tagged_sents(): list of (list of (str,str))
- tagged_paras(): list of (list of (list of (str,str)))
- chunked_sents(): list of (Tree w/ (str,str) leaves)
- parsed_sents(): list of (Tree with str leaves)
- parsed_paras(): list of (list of (Tree with str leaves))
- xml(): A single xml ElementTree
- raw(): unprocessed corpus contents
For example, to read a list of the words in the Brown Corpus, use
``nltk.corpus.brown.words()``:
>>> from nltk.corpus import brown
>>> print(", ".join(brown.words()))
The, Fulton, County, Grand, Jury, said, ...
"""
from nltk.corpus.reader.plaintext import *
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
from nltk.corpus.reader.tagged import *
from nltk.corpus.reader.cmudict import *
from nltk.corpus.reader.conll import *
from nltk.corpus.reader.chunked import *
from nltk.corpus.reader.wordlist import *
from nltk.corpus.reader.xmldocs import *
from nltk.corpus.reader.ppattach import *
from nltk.corpus.reader.senseval import *
from nltk.corpus.reader.ieer import *
from nltk.corpus.reader.sinica_treebank import *
from nltk.corpus.reader.bracket_parse import *
from nltk.corpus.reader.indian import *
from nltk.corpus.reader.toolbox import *
from nltk.corpus.reader.timit import *
from nltk.corpus.reader.ycoe import *
from nltk.corpus.reader.rte import *
from nltk.corpus.reader.string_category import *
from nltk.corpus.reader.propbank import *
from nltk.corpus.reader.verbnet import *
from nltk.corpus.reader.bnc import *
from nltk.corpus.reader.nps_chat import *
from nltk.corpus.reader.wordnet import *
from nltk.corpus.reader.switchboard import *
from nltk.corpus.reader.dependency import *
from nltk.corpus.reader.nombank import *
from nltk.corpus.reader.ipipan import *
from nltk.corpus.reader.pl196x import *
from nltk.corpus.reader.knbc import *
from nltk.corpus.reader.chasen import *
from nltk.corpus.reader.childes import *
from nltk.corpus.reader.aligned import *
from nltk.corpus.reader.lin import *
from nltk.corpus.reader.semcor import *
from nltk.corpus.reader.framenet import *
from nltk.corpus.reader.udhr import *
from nltk.corpus.reader.bnc import *
from nltk.corpus.reader.sentiwordnet import *
from nltk.corpus.reader.twitter import *
from nltk.corpus.reader.nkjp import *
from nltk.corpus.reader.crubadan import *
from nltk.corpus.reader.mte import *
from nltk.corpus.reader.reviews import *
from nltk.corpus.reader.opinion_lexicon import *
from nltk.corpus.reader.pros_cons import *
from nltk.corpus.reader.categorized_sents import *
from nltk.corpus.reader.comparative_sents import *
from nltk.corpus.reader.panlex_lite import *
from nltk.corpus.reader.panlex_swadesh import *
# Make sure that nltk.corpus.reader.bracket_parse gives the module, not
# the function bracket_parse() defined in nltk.tree:
from nltk.corpus.reader import bracket_parse
# Public API of this package: the names bound by
# ``from nltk.corpus.reader import *``. None of these names is defined in
# this file; they are expected to come from the wildcard imports of the
# individual reader modules. Each name is listed exactly once.
__all__ = [
    'CorpusReader',
    'CategorizedCorpusReader',
    'PlaintextCorpusReader',
    'find_corpus_fileids',
    'TaggedCorpusReader',
    'CMUDictCorpusReader',
    'ConllChunkCorpusReader',
    'WordListCorpusReader',
    'PPAttachmentCorpusReader',
    'SensevalCorpusReader',
    'IEERCorpusReader',
    'ChunkedCorpusReader',
    'SinicaTreebankCorpusReader',
    'BracketParseCorpusReader',
    'IndianCorpusReader',
    'ToolboxCorpusReader',
    'TimitCorpusReader',
    'YCOECorpusReader',
    'MacMorphoCorpusReader',
    'SyntaxCorpusReader',
    'AlpinoCorpusReader',
    'RTECorpusReader',
    'StringCategoryCorpusReader',
    'EuroparlCorpusReader',
    'CategorizedBracketParseCorpusReader',
    'CategorizedTaggedCorpusReader',
    'CategorizedPlaintextCorpusReader',
    'PortugueseCategorizedPlaintextCorpusReader',
    'tagged_treebank_para_block_reader',
    'PropbankCorpusReader',
    'VerbnetCorpusReader',
    'BNCCorpusReader',
    'ConllCorpusReader',
    'XMLCorpusReader',
    'NPSChatCorpusReader',
    'SwadeshCorpusReader',
    'WordNetCorpusReader',
    'WordNetICCorpusReader',
    'SwitchboardCorpusReader',
    'DependencyCorpusReader',
    'NombankCorpusReader',
    'IPIPANCorpusReader',
    'Pl196xCorpusReader',
    'TEICorpusView',
    'KNBCorpusReader',
    'ChasenCorpusReader',
    'CHILDESCorpusReader',
    'AlignedCorpusReader',
    'TimitTaggedCorpusReader',
    'LinThesaurusCorpusReader',
    'SemcorCorpusReader',
    'FramenetCorpusReader',
    'UdhrCorpusReader',
    'SentiWordNetCorpusReader',
    'SentiSynset',
    'TwitterCorpusReader',
    'NKJPCorpusReader',
    'CrubadanCorpusReader',
    'MTECorpusReader',
    'ReviewsCorpusReader',
    'OpinionLexiconCorpusReader',
    'ProsConsCorpusReader',
    'CategorizedSentencesCorpusReader',
    'ComparativeSentencesCorpusReader',
    'PanLexLiteCorpusReader',
    'NonbreakingPrefixesCorpusReader',
    'UnicharsCorpusReader',
    'MWAPPDBCorpusReader',
    'PanlexSwadeshCorpusReader',
]