import sys
import codecs

import nltk
from nltk.corpus import stopwords

# NLTK's default English stopwords
default_stopwords = set(stopwords.words('english'))

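# Note: the stopword list and the 'punkt' tokenizer used by word_tokenize()
# below are NLTK data packages that must be present locally; if they are
# missing, they can be fetched once with:
#
#   nltk.download('stopwords')
#   nltk.download('punkt')
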
# Read custom stopwords from a file (one stopword per line, UTF-8)
stopwords_file = './stopwords.txt'
custom_stopwords = set(codecs.open(stopwords_file, 'r', 'utf-8').read().splitlines())

# Combine NLTK's defaults with the custom list
all_stopwords = default_stopwords | custom_stopwords

# Read the input text and tokenize it
with open('faceapp.txt', 'r') as f:
    raw = f.read()
tokens = nltk.word_tokenize(raw)

# Show every occurrence of 'services' in its surrounding context
faceapp = nltk.Text(tokens)
faceapp.concordance('services')

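# concordance() prints its matches to stdout; the output width and number of
# lines shown can be adjusted if needed, e.g. (parameter values illustrative):
#
#   faceapp.concordance('services', width=100, lines=30)
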
# Remove single-character tokens (mostly punctuation)
tokens = [word for word in tokens if len(word) > 1]

# Remove numbers
tokens = [word for word in tokens if not word.isnumeric()]

# Lowercase all words (default_stopwords are lowercase too)
tokens = [word.lower() for word in tokens]

# Remove stopwords
tokens = [word for word in tokens if word not in all_stopwords]

# Calculate the frequency distribution of the cleaned tokens
fdist = nltk.FreqDist(tokens)

# Output the 10 most common words as "word;count"
for word, frequency in fdist.most_common(10):
    print(u'{};{}'.format(word, frequency))
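
# Optional: FreqDist can also draw a simple frequency plot (this relies on
# matplotlib being installed); a minimal sketch of the call:
#
#   fdist.plot(10, cumulative=False)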