"""Tokenize and POS-tag the text in faceapp.txt, writing the tagged
sentences to colonial-glossary.json.

Pipeline: load custom stopwords -> read corpus -> tokenize -> filter
tokens (punctuation, numbers, case) -> POS-tag per sentence -> dump JSON.
"""
import sys
import codecs
import nltk
import json
from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize, pos_tag

# Read custom stop words from a file (one stopword per line, UTF-8).
# FIX: use the stopwords_file variable (it was defined but ignored) and
# close the handle via a context manager (it was leaked before).
stopwords_file = './stopwords.txt'
with codecs.open(stopwords_file, 'r', 'utf-8') as sw_file:
    custom_stopwords = set(sw_file.read().splitlines())
# NOTE(review): custom_stopwords is built but never applied to `tokens`
# below — presumably filtering is still to be added; confirm intent.

# Open the corpus file, read it, and tokenize.
# FIX: context manager closes the file; explicit UTF-8 matches the
# encoding used for the stopwords file (default encoding is
# platform-dependent).
with open('faceapp.txt', 'r', encoding='utf-8') as corpus_file:
    raw = corpus_file.read()
tokens = nltk.word_tokenize(raw)
faceapp = nltk.Text(tokens)  # NOTE(review): unused below — kept for interactive exploration?

# Remove single-character tokens (mostly punctuation).
tokens = [word for word in tokens if len(word) > 1]

# Remove numbers.
tokens = [word for word in tokens if not word.isnumeric()]

# Lowercase all words (default stopwords are lowercase too).
tokens = [word.lower() for word in tokens]

# POS-tag each sentence of the raw text.
# FIX: the original assigned the result to `pos_tag`, shadowing the
# imported pos_tag function — any later call would raise TypeError.
tagged_sentences = [pos_tag(word_tokenize(sent)) for sent in sent_tokenize(raw)]
print(tagged_sentences)

# Persist the tagged sentences (list of lists of [token, tag] pairs).
with open('colonial-glossary.json', 'w') as json_file:
    json.dump(tagged_sentences, json_file)