bo-graduation/nltk-book/NLTK_V3.py

import sys
import codecs
import nltk
import json
from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize, pos_tag


# Read custom stop words from a file (one stop word per line, UTF-8).
# The context manager closes the handle as soon as the set is built.
stopwords_file = './stopwords.txt'
with open(stopwords_file, 'r', encoding='utf-8') as stopwords_handle:
    custom_stopwords = set(stopwords_handle.read().splitlines())
# NOTE(review): custom_stopwords is loaded but not applied to `tokens`
# anywhere in the visible part of this script.

# Read the source text and tokenize it into words.
# NOTE(review): assumes faceapp.txt is UTF-8 like the stop-word file --
# TODO confirm.
with open('faceapp.txt', 'r', encoding='utf-8') as text_handle:
    raw = text_handle.read()
tokens = nltk.word_tokenize(raw)
# The Text object is built from the unfiltered tokens, before the clean-up
# below.
faceapp = nltk.Text(tokens)

# Clean the token list in a single pass:
#   * drop single-character tokens (mostly punctuation),
#   * drop purely numeric tokens,
#   * lowercase what remains.
tokens = [
    word.lower()
    for word in tokens
    if len(word) > 1 and not word.isnumeric()
]

# POS-tag the raw text sentence by sentence. The result is stored under its
# own name so the imported `pos_tag` function is not overwritten by a list.
tagged_sentences = [
    pos_tag(word_tokenize(sentence)) for sentence in sent_tokenize(raw)
]
print(tagged_sentences)

# Persist the tagged sentences; json.dump writes each (word, tag) tuple as a
# two-element JSON array.
with open('colonial-glossary.json', 'w') as json_file:
    json.dump(tagged_sentences, json_file)
second script 5 years ago			`import sys`
			`import codecs`
			`import nltk`
more updates 5 years ago			`import json`
second script 5 years ago			`from nltk.corpus import stopwords`
thrid updates 5 years ago			`from nltk import sent_tokenize, word_tokenize, pos_tag`
second script 5 years ago
more updates 5 years ago
second script 5 years ago			`#read stop words from a file (one stopword per line, UTF-8)`
			`stopwords_file = './stopwords.txt'`
			`custom_stopwords = set(codecs.open('stopwords.txt', 'r', 'utf-8').read().splitlines())`


thrid updates 5 years ago			`#open the txt file, read, and tokenize`
second script 5 years ago			`file = open('faceapp.txt','r')`
			`raw = file.read()`
			`tokens = nltk.word_tokenize(raw)`
			`faceapp = nltk.Text(tokens)`

			`# Remove single-character tokens (mostly punctuation)`
			`tokens = [word for word in tokens if len(word) > 1]`
			`# Remove numbers`
			`tokens = [word for word in tokens if not word.isnumeric()]`
			`# Lowercase all words (default_stopwords are lowercase too)`
			`tokens = [word.lower() for word in tokens]`

thrid updates 5 years ago			`# pos_tag = [word_tokenize(sent) for sent in sent_tokenize(raw)]`
			`pos_tag = [pos_tag(word_tokenize(sent))for sent in sent_tokenize(raw)]`
more updates 5 years ago			`print(pos_tag)`

			`with open ('colonial-glossary.json', 'w') as json_file:`
			`json.dump(pos_tag, json_file)`