"""Print the most frequent non-stopword tokens found in ``faceapp.txt``.

The script tokenizes the text with NLTK, prints a concordance for the word
'services', filters out single-character tokens, numeric tokens and
stopwords (NLTK's English list plus a custom list read from
``stopwords.txt``), and finally prints the most common remaining words as
``word;frequency`` lines.
"""

import sys
import codecs

import nltk
from nltk.corpus import stopwords

# Number of most frequent words to print.
TOP_WORDS = 10

# NLTK's default English stopwords.
default_stopwords = set(nltk.corpus.stopwords.words('english'))

# Read custom stopwords from a file (one stopword per line, UTF-8).
stopwords_file = './stopwords.txt'
with codecs.open(stopwords_file, 'r', 'utf-8') as stopwords_handle:
    custom_stopwords = set(stopwords_handle.read().splitlines())

all_stopwords = default_stopwords | custom_stopwords

# NOTE(review): no explicit encoding is given here, so decoding depends on
# the interpreter/platform default -- confirm the encoding of faceapp.txt.
with open('faceapp.txt', 'r') as file:
    raw = file.read()

tokens = nltk.word_tokenize(raw)

# The concordance is built from the unfiltered tokens so that the
# surrounding context of each match stays intact.
faceapp = nltk.Text(tokens)
faceapp.concordance('services')

# Drop single-character tokens (mostly punctuation) and numbers, then
# lowercase what is left (default_stopwords are lowercase too).
tokens = [
    word.lower()
    for word in tokens
    if len(word) > 1 and not word.isnumeric()
]

# Remove stopwords.
tokens = [word for word in tokens if word not in all_stopwords]

# Calculate the frequency distribution.
fdist = nltk.FreqDist(tokens)

# Output the TOP_WORDS most common words, one "word;frequency" per line.
for word, frequency in fdist.most_common(TOP_WORDS):
    print(u'{};{}'.format(word, frequency))