|
|
|
import sys
|
|
|
|
import codecs
|
|
|
|
import nltk
|
|
|
|
import json
|
|
|
|
from nltk.corpus import stopwords
|
|
|
|
from nltk import sent_tokenize, word_tokenize, pos_tag
|
|
|
|
|
|
|
|
|
|
|
|
# Read custom stop words from a file (one stop word per line, UTF-8).
stopwords_file = './stopwords.txt'
# Read through the path defined above and close the handle as soon as the
# contents have been loaded.
with codecs.open(stopwords_file, 'r', 'utf-8') as stopwords_handle:
    custom_stopwords = set(stopwords_handle.read().splitlines())
# NOTE(review): custom_stopwords is loaded but never applied to the tokens
# below -- confirm whether a stop-word filtering step was intended.

# Open the text file, read it, and tokenize it into words.
# NOTE(review): no encoding is given, so the platform default is used --
# confirm the encoding of faceapp.txt.
with open('faceapp.txt', 'r') as text_file:
    raw = text_file.read()
tokens = nltk.word_tokenize(raw)
# The Text object is built from the unfiltered tokens, before the cleanup below.
faceapp = nltk.Text(tokens)

# Clean the tokens in a single pass:
#   - drop single-character tokens (mostly punctuation),
#   - drop purely numeric tokens,
#   - lowercase every remaining word.
tokens = [
    word.lower()
    for word in tokens
    if len(word) > 1 and not word.isnumeric()
]

# Part-of-speech tag every sentence of the raw text. The result is one list
# per sentence, holding the tagger's output for that sentence's word tokens.
# It is stored under its own name so the imported pos_tag function is not
# overwritten by its result.
tagged_sentences = [
    pos_tag(word_tokenize(sentence)) for sentence in sent_tokenize(raw)
]
print(tagged_sentences)

# Write the tagged sentences to disk as JSON.
with open('colonial-glossary.json', 'w') as json_file:
    json.dump(tagged_sentences, json_file)