You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

33 lines
1000 B
Python

5 years ago
import sys
import codecs
import nltk
5 years ago
import json
5 years ago
from nltk.corpus import stopwords
5 years ago
from nltk import sent_tokenize, word_tokenize, pos_tag
5 years ago
5 years ago
5 years ago
# Script body: load a custom stopword list, tokenize 'faceapp.txt', build a
# cleaned lowercase token list, POS-tag the text sentence by sentence, then
# print the tagged sentences and write them to 'colonial-glossary.json'.

# Read stop words from a file (one stopword per line, UTF-8).
stopwords_file = './stopwords.txt'
with open(stopwords_file, 'r', encoding='utf-8') as stopwords_handle:
    custom_stopwords = set(stopwords_handle.read().splitlines())

# Open the text file, read it fully, and tokenize it.
# NOTE(review): opened with the platform default encoding, as before; pass
# encoding='utf-8' here if the corpus is known to be UTF-8.
with open('faceapp.txt', 'r') as file:
    raw = file.read()
tokens = nltk.word_tokenize(raw)
# Built from the unfiltered tokens, before the cleanup steps below.
faceapp = nltk.Text(tokens)

# Remove single-character tokens (mostly punctuation).
tokens = [word for word in tokens if len(word) > 1]
# Remove numbers.
tokens = [word for word in tokens if not word.isnumeric()]
# Lowercase all words (stopword entries are presumably lowercase too).
tokens = [word.lower() for word in tokens]

# Tag each sentence of the raw text separately. The result gets its own name
# so the imported `pos_tag` function is not overwritten by its output.
tagged_sentences = [
    pos_tag(word_tokenize(sent)) for sent in sent_tokenize(raw)
]

print(tagged_sentences)
# json.dump serializes each (word, tag) tuple as a two-element JSON array.
with open('colonial-glossary.json', 'w') as json_file:
    json.dump(tagged_sentences, json_file)