You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
215 lines
3.6 KiB
Python
215 lines
3.6 KiB
Python
# Natural Language Toolkit: Some texts for exploration in chapter 1 of the book
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
|
|
from __future__ import print_function
|
|
|
|
from nltk.corpus import (
|
|
gutenberg,
|
|
genesis,
|
|
inaugural,
|
|
nps_chat,
|
|
webtext,
|
|
treebank,
|
|
wordnet,
|
|
)
|
|
from nltk.text import Text
|
|
from nltk.probability import FreqDist
|
|
from nltk.util import bigrams
|
|
|
|
# Start-up banner: written to stdout once, when this module is first imported.
# A single print() call with a newline separator emits the same four lines as
# four separate calls would.
print(
    "*** Introductory Examples for the NLTK Book ***",
    "Loading text1, ..., text9 and sent1, ..., sent9",
    "Type the name of the text or sentence to view it.",
    "Type: 'texts()' or 'sents()' to list the materials.",
    sep="\n",
)
|
|
|
|
# Build the nine example texts at import time. Each one is announced on stdout
# as soon as it has been constructed, so the lines appear one by one while the
# corpora are being read.
#
# text1, text2 and text9 pass no explicit ``name=``; whatever ``Text`` assigns
# as ``.name`` for them is what gets printed.

# --- Gutenberg corpus: 'melville-moby_dick.txt' ---
text1 = Text(gutenberg.words("melville-moby_dick.txt"))
print("text1:", text1.name)

# --- Gutenberg corpus: 'austen-sense.txt' ---
text2 = Text(gutenberg.words("austen-sense.txt"))
print("text2:", text2.name)

# --- Genesis corpus: 'english-kjv.txt' ---
text3 = Text(genesis.words("english-kjv.txt"), name="The Book of Genesis")
print("text3:", text3.name)

# --- Inaugural corpus: all files ---
text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
print("text4:", text4.name)

# --- NPS chat corpus: all files ---
text5 = Text(nps_chat.words(), name="Chat Corpus")
print("text5:", text5.name)

# --- Webtext corpus: 'grail.txt' ---
text6 = Text(webtext.words("grail.txt"), name="Monty Python and the Holy Grail")
print("text6:", text6.name)

# --- Treebank corpus: all files ---
text7 = Text(treebank.words(), name="Wall Street Journal")
print("text7:", text7.name)

# --- Webtext corpus: 'singles.txt' ---
text8 = Text(webtext.words("singles.txt"), name="Personals Corpus")
print("text8:", text8.name)

# --- Gutenberg corpus: 'chesterton-thursday.txt' ---
text9 = Text(gutenberg.words("chesterton-thursday.txt"))
print("text9:", text9.name)
|
|
|
|
|
|
def texts():
    """List the nine example texts on stdout.

    Prints one line per text, in order from ``text1`` to ``text9``: the
    label ``textN:`` followed by that text's ``name`` attribute. The
    module-level names are looked up at call time. Returns ``None``.
    """
    labelled_texts = (
        ("text1:", text1),
        ("text2:", text2),
        ("text3:", text3),
        ("text4:", text4),
        ("text5:", text5),
        ("text6:", text6),
        ("text7:", text7),
        ("text8:", text8),
        ("text9:", text9),
    )
    for label, text in labelled_texts:
        print(label, text.name)
|
|
|
|
|
|
# Example sentences, each stored as a list of token strings in which
# punctuation marks are tokens of their own. The numbering parallels
# text1..text9; presumably each sentence is drawn from the text with the
# matching number (verify against the corpora).

sent1 = ["Call", "me", "Ishmael", "."]

sent2 = [
    "The", "family", "of", "Dashwood", "had", "long",
    "been", "settled", "in", "Sussex", ".",
]

sent3 = [
    "In", "the", "beginning", "God", "created", "the",
    "heaven", "and", "the", "earth", ".",
]

sent4 = [
    "Fellow", "-", "Citizens", "of", "the", "Senate", "and",
    "of", "the", "House", "of", "Representatives", ":",
]

sent5 = [
    "I", "have", "a", "problem", "with", "people",
    "PMing", "me", "to", "lol", "JOIN",
]

sent6 = [
    "SCENE", "1", ":", "[", "wind", "]",
    "[", "clop", "clop", "clop", "]",
    "KING", "ARTHUR", ":", "Whoa", "there", "!",
]

sent7 = [
    "Pierre", "Vinken", ",", "61", "years", "old", ",",
    "will", "join", "the", "board", "as", "a",
    "nonexecutive", "director", "Nov.", "29", ".",
]

sent8 = [
    "25", "SEXY", "MALE", ",", "seeks", "attrac", "older",
    "single", "lady", ",", "for", "discreet", "encounters", ".",
]

sent9 = [
    "THE", "suburb", "of", "Saffron", "Park", "lay", "on", "the",
    "sunset", "side", "of", "London", ",", "as", "red", "and",
    "ragged", "as", "a", "cloud", "of", "sunset", ".",
]
|
|
|
|
|
|
def sents():
    """List the nine example sentences on stdout.

    Prints one line per sentence, in order from ``sent1`` to ``sent9``: the
    label ``sentN:`` followed by that sentence's tokens joined with single
    spaces. The module-level names are looked up at call time. Returns
    ``None``.
    """
    labelled_sents = (
        ("sent1:", sent1),
        ("sent2:", sent2),
        ("sent3:", sent3),
        ("sent4:", sent4),
        ("sent5:", sent5),
        ("sent6:", sent6),
        ("sent7:", sent7),
        ("sent8:", sent8),
        ("sent9:", sent9),
    )
    for label, tokens in labelled_sents:
        print(label, " ".join(tokens))
|