more updates

master
bootje 5 years ago
parent 2b81f12b04
commit 2f6003b4ef

@ -1,9 +1,11 @@
import sys
import codecs
import nltk
import json
from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize, pos_tag
# Read custom stop words from a file (one stop word per line, UTF-8).
stopwords_file = './stopwords.txt'
# Open via a context manager so the handle is closed as soon as the contents
# are read, and use stopwords_file instead of repeating the path as a literal.
with codecs.open(stopwords_file, 'r', 'utf-8') as stopwords_handle:
    custom_stopwords = set(stopwords_handle.read().splitlines())
@ -24,4 +26,7 @@ tokens = [word.lower() for word in tokens]
# Tag every sentence of the raw text with part-of-speech labels: one list of
# (token, tag) pairs per sentence. The result is stored under its own name so
# the imported pos_tag function is not overwritten and stays callable.
tagged_sentences = [
    pos_tag(word_tokenize(sent)) for sent in sent_tokenize(raw)
]
print(tagged_sentences)

# Persist the tagged sentences; json serialises each (token, tag) tuple as a
# two-element array.
with open('colonial-glossary.json', 'w') as json_file:
    json.dump(tagged_sentences, json_file)

File diff suppressed because one or more lines are too long
Loading…
Cancel
Save