# NOTE(review): this script looks damaged (whitespace collapsed, string
# contents missing) and is not valid Python as it stands. Observations that
# the visible text supports:
#
# * The next line starts with '#', so everything collapsed onto it -- the
#   imports, nltk.download('stopwords'), the read of
#   'treaty_file/russia-estonia.txt', the paragraph split on "\n\n", the
#   stop-word sets and the opening print(''' -- is a comment and never
#   executes.
# * The two lines after it start and end with unterminated string fragments
#   (') ... print('). Presumably markup inside those print() literals was
#   stripped during extraction -- TODO confirm against the original file.
# * `t_tagged` is iterated but never assigned in the visible code. It is
#   presumably a list of (word, tag) pairs from pos_tag(word_tokenize(...))
#   -- verify against the original.
# * The only placeholder left in the format string is '{2}', so only the
#   third argument (the raw `t_word`) is emitted. The first two arguments
#   (the sanitised tag and the sanitised, lower-cased word) are computed but
#   unused. Both sanitising chains map '.' -> 'dot', ',' -> 'comma' and each
#   of '(', ')', ':', ';' -> 'marks'.
#
# Restore the original string literals and the missing tokenising/tagging
# step before attempting to run or refactor this script.
# from __future__ import division from nltk import sent_tokenize, word_tokenize, pos_tag from nltk.probability import FreqDist from nltk.corpus import stopwords import nltk import codecs import base64 nltk.download('stopwords') with open('treaty_file/russia-estonia.txt', 'r') as russia_file: russia_text = russia_file.read() russia_text_list = russia_text.split("\n\n") t_default_stopwords = set(stopwords.words('english')) t_custom_stopwords = set(codecs.open('t_stopwords.txt', 'r').read().splitlines()) t_all_stopwords = t_default_stopwords | t_custom_stopwords print('''
') for t_word, t_pos in t_tagged: print('{2}'.format(t_pos.replace('.', 'dot').replace(',', 'comma').replace('(', 'marks').replace(')', 'marks').replace(':', 'marks').replace(';', 'marks'), t_word.replace('.', 'dot').replace(',', 'comma').replace('(', 'marks').replace(')', 'marks').replace(':', 'marks').replace(';', 'marks').lower(), t_word)) print('
') print('