You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

214 lines
3.6 KiB
Python

# Natural Language Toolkit: Some texts for exploration in chapter 1 of the book
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from nltk.corpus import (
gutenberg,
genesis,
inaugural,
nps_chat,
webtext,
treebank,
wordnet,
)
from nltk.text import Text
from nltk.probability import FreqDist
from nltk.util import bigrams
# Banner printed as soon as the module is imported.
print(
    "*** Introductory Examples for the NLTK Book ***",
    "Loading text1, ..., text9 and sent1, ..., sent9",
    "Type the name of the text or sentence to view it.",
    "Type: 'texts()' or 'sents()' to list the materials.",
    sep="\n",
)


def _announce(label, text):
    """Print *label* followed by ``text.name`` and return *text* unchanged."""
    print(label, text.name)
    return text


# Each text is built and announced in turn, so the listing appears
# incrementally while the corpora load.
text1 = _announce("text1:", Text(gutenberg.words("melville-moby_dick.txt")))
text2 = _announce("text2:", Text(gutenberg.words("austen-sense.txt")))
text3 = _announce(
    "text3:", Text(genesis.words("english-kjv.txt"), name="The Book of Genesis")
)
text4 = _announce("text4:", Text(inaugural.words(), name="Inaugural Address Corpus"))
text5 = _announce("text5:", Text(nps_chat.words(), name="Chat Corpus"))
text6 = _announce(
    "text6:",
    Text(webtext.words("grail.txt"), name="Monty Python and the Holy Grail"),
)
text7 = _announce("text7:", Text(treebank.words(), name="Wall Street Journal"))
text8 = _announce(
    "text8:", Text(webtext.words("singles.txt"), name="Personals Corpus")
)
text9 = _announce("text9:", Text(gutenberg.words("chesterton-thursday.txt")))
def texts():
    """Print one line per loaded text: its label followed by its ``name``."""
    catalogue = (
        ("text1:", text1),
        ("text2:", text2),
        ("text3:", text3),
        ("text4:", text4),
        ("text5:", text5),
        ("text6:", text6),
        ("text7:", text7),
        ("text8:", text8),
        ("text9:", text9),
    )
    for label, text in catalogue:
        print(label, text.name)
# Sample sentences as token lists, numbered to parallel text1..text9
# (presumably the opening tokens of the matching text -- not verified here).
sent1 = ["Call", "me", "Ishmael", "."]
sent2 = [
    "The", "family", "of", "Dashwood", "had", "long",
    "been", "settled", "in", "Sussex", ".",
]
sent3 = [
    "In", "the", "beginning", "God", "created", "the",
    "heaven", "and", "the", "earth", ".",
]
sent4 = [
    "Fellow", "-", "Citizens", "of", "the", "Senate", "and",
    "of", "the", "House", "of", "Representatives", ":",
]
sent5 = [
    "I", "have", "a", "problem", "with", "people",
    "PMing", "me", "to", "lol", "JOIN",
]
sent6 = [
    "SCENE", "1", ":", "[", "wind", "]",
    "[", "clop", "clop", "clop", "]",
    "KING", "ARTHUR", ":", "Whoa", "there", "!",
]
sent7 = [
    "Pierre", "Vinken", ",", "61", "years", "old", ",",
    "will", "join", "the", "board", "as", "a",
    "nonexecutive", "director", "Nov.", "29", ".",
]
sent8 = [
    "25", "SEXY", "MALE", ",", "seeks", "attrac", "older",
    "single", "lady", ",", "for", "discreet", "encounters", ".",
]
sent9 = [
    "THE", "suburb", "of", "Saffron", "Park", "lay", "on", "the",
    "sunset", "side", "of", "London", ",", "as", "red", "and",
    "ragged", "as", "a", "cloud", "of", "sunset", ".",
]
def sents():
    """Print one line per sample sentence: its label and space-joined tokens."""
    labelled = (
        ("sent1:", sent1),
        ("sent2:", sent2),
        ("sent3:", sent3),
        ("sent4:", sent4),
        ("sent5:", sent5),
        ("sent6:", sent6),
        ("sent7:", sent7),
        ("sent8:", sent8),
        ("sent9:", sent9),
    )
    for label, tokens in labelled:
        print(label, " ".join(tokens))