You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

215 lines
3.6 KiB
Python

# Natural Language Toolkit: Some texts for exploration in chapter 1 of the book
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from __future__ import print_function
from nltk.corpus import (
gutenberg,
genesis,
inaugural,
nps_chat,
webtext,
treebank,
wordnet,
)
from nltk.text import Text
from nltk.probability import FreqDist
from nltk.util import bigrams
# Banner printed when this module is executed (e.g. on import): tells the
# user which names are about to be defined and how to list them again.
print("*** Introductory Examples for the NLTK Book ***")
print("Loading text1, ..., text9 and sent1, ..., sent9")
print("Type the name of the text or sentence to view it.")
print("Type: 'texts()' or 'sents()' to list the materials.")

# Build text1 ... text9 as Text objects over the word sequences returned by
# the corpus readers imported above.  Each text's label and name are printed
# immediately after that text is constructed, so the output appears one text
# at a time in load order.
#
# text1, text2 and text9 pass no name= argument, so the name printed for them
# is whatever Text assigns by default; the others are given explicit names.
text1 = Text(gutenberg.words('melville-moby_dick.txt'))
print("text1:", text1.name)
text2 = Text(gutenberg.words('austen-sense.txt'))
print("text2:", text2.name)
text3 = Text(genesis.words('english-kjv.txt'), name="The Book of Genesis")
print("text3:", text3.name)
# inaugural, nps_chat and treebank are read without a file argument.
text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
print("text4:", text4.name)
text5 = Text(nps_chat.words(), name="Chat Corpus")
print("text5:", text5.name)
text6 = Text(webtext.words('grail.txt'), name="Monty Python and the Holy Grail")
print("text6:", text6.name)
text7 = Text(treebank.words(), name="Wall Street Journal")
print("text7:", text7.name)
text8 = Text(webtext.words('singles.txt'), name="Personals Corpus")
print("text8:", text8.name)
text9 = Text(gutenberg.words('chesterton-thursday.txt'))
print("text9:", text9.name)
def texts():
    """Print one line per loaded text: its label (text1 ... text9) and name."""
    loaded = (text1, text2, text3, text4, text5, text6, text7, text8, text9)
    for number, text in enumerate(loaded, start=1):
        # Same "textN: <name>" layout as a two-argument print of label, name.
        print("text%d:" % number, text.name)
# Example sentences sent1 ... sent9.  Each is a list of string tokens, with
# punctuation marks stored as separate items; sents() prints them joined by
# single spaces.
sent1 = ["Call", "me", "Ishmael", "."]
sent2 = ["The", "family", "of", "Dashwood", "had", "long", "been", "settled",
         "in", "Sussex", "."]
sent3 = ["In", "the", "beginning", "God", "created", "the", "heaven", "and",
         "the", "earth", "."]
sent4 = ["Fellow", "-", "Citizens", "of", "the", "Senate", "and", "of", "the",
         "House", "of", "Representatives", ":"]
sent5 = ["I", "have", "a", "problem", "with", "people", "PMing", "me", "to",
         "lol", "JOIN"]
sent6 = ["SCENE", "1", ":", "[", "wind", "]", "[", "clop", "clop", "clop", "]",
         "KING", "ARTHUR", ":", "Whoa", "there", "!"]
sent7 = ["Pierre", "Vinken", ",", "61", "years", "old", ",", "will", "join",
         "the", "board", "as", "a", "nonexecutive", "director", "Nov.", "29",
         "."]
sent8 = ["25", "SEXY", "MALE", ",", "seeks", "attrac", "older", "single",
         "lady", ",", "for", "discreet", "encounters", "."]
sent9 = ["THE", "suburb", "of", "Saffron", "Park", "lay", "on", "the",
         "sunset", "side", "of", "London", ",", "as", "red", "and", "ragged",
         "as", "a", "cloud", "of", "sunset", "."]
def sents():
    """Print one line per example sentence: its label and space-joined tokens."""
    examples = (sent1, sent2, sent3, sent4, sent5, sent6, sent7, sent8, sent9)
    for number, tokens in enumerate(examples, start=1):
        # Same "sentN: tok tok ..." layout as printing the label and the join.
        print("sent%d:" % number, " ".join(tokens))