# from __future__ import division from nltk import sent_tokenize, word_tokenize, pos_tag from nltk.probability import FreqDist from nltk.corpus import stopwords import nltk import codecs import base64 nltk.download('stopwords') # faceapp_file = open('faceapp.txt','r') with open('tos_file/instagram.txt', 'r') as faceapp_file: faceapp_text = faceapp_file.read() faceapp_text_list = faceapp_text.split("\n\n") #tos stopwords tos_default_stopwords = set(stopwords.words('english')) tos_custom_stopwords = set(codecs.open('stopwords.txt', 'r').read().splitlines()) tos_all_stopwords = tos_default_stopwords | tos_custom_stopwords # multi-line string HTML print('''
') for word, pos in tagged: print('{2}'.format(pos.replace('PRP$', 'PRPS').replace('.', 'dot').replace(',', 'comma').replace('(', 'marks').replace(')', 'marks').replace(':', 'marks').replace(';', 'marks'), word.replace('’', 'apostrophe').replace('.', 'dot').replace(',', 'comma').replace('(', 'marks').replace(')', 'marks').replace(':', 'marks').replace(';', 'marks').lower(), word)) print('
') print('