"""Compute TF-IDF scores for words in a small corpus of text files."""

import math
import os


def tfidf(query, document, corpus):
    """Return the TF-IDF statistics of *query* for one document in a corpus.

    Args:
        query: The word to score.
        document: A sized iterable of words (e.g. a list of tokens) in which
            the term frequency is measured.
        corpus: A sized iterable of documents (each a collection of words)
            over which the document frequency is measured.

    Returns:
        A tuple ``(tf_count, idf_count, tfidf_value)`` where ``tf_count`` is
        the number of occurrences of *query* in *document*, ``idf_count`` is
        the number of documents in *corpus* that contain *query*, and
        ``tfidf_value`` is ``(tf_count / len(document)) *
        log(len(corpus) / idf_count)``.

        ``tfidf_value`` is ``0.0`` when *document* is empty or when no
        document in *corpus* contains *query* (including an empty corpus),
        because the ratio is undefined in those cases.
    """
    # Term Frequency: raw number of occurrences in this document.
    tf_count = sum(1 for word in document if query == word)

    # Document Frequency: number of corpus documents containing the query.
    # A distinct loop name is used so the `document` argument is not rebound.
    idf_count = sum(1 for other in corpus if query in other)

    # Either denominator being zero would raise ZeroDivisionError; report a
    # neutral score instead of crashing.
    if not len(document) or idf_count == 0:
        return tf_count, idf_count, 0.0

    tf = tf_count / len(document)
    idf = math.log(len(corpus) / idf_count)

    # Term Frequency * Inverse Document Frequency
    tfidf_value = tf * idf

    return tf_count, idf_count, tfidf_value


# ---

# USE THE TF-IDF ALGORITHM

if __name__ == "__main__":
    # nltk is only needed to tokenise the text files, so it is imported here;
    # tfidf() itself can then be imported without nltk or the ./txt/ folder.
    import nltk

    directory = './txt/'
    textfiles = os.listdir(directory)

    corpus = []
    for manifesto in textfiles:
        # open the txt file (closed again automatically by the with-block)
        with open(os.path.join(directory, manifesto), 'r') as textfile:
            text = textfile.read()
        text = text.lower()  # make all lowercase
        document = nltk.word_tokenize(text)  # make a list of words
        corpus.append(document)

    # q = 'world' # query
    # d = corpus[0] # one document
    # c = corpus # full corpus
    # tf_count, idf_count, tfidf_value = tfidf(q, d, c)
    # print('query:', q)
    # print('TF:', tf_count)
    # print('IDF:', idf_count)
    # print('TF-IDF:', tfidf_value)

    # ---

    # PRINT OUT ALL TF-IDF VALUES

    # for document in corpus:
    #     for word in document:
    #         tf_count, idf_count, tfidf_value = tfidf(word, document, corpus)
    #         print('query:', word)
    #         print('TF:', tf_count)
    #         print('IDF:', idf_count)
    #         print('TF-IDF:', tfidf_value)
    #         print('---')

    # ---

    # VISUALISE THE TF-IDF, USING ONE OF THE MANIFESTOS

    # manifesto = textfiles[8] # pick one manifesto
    # text = open(f'{ directory }/{ manifesto }', 'r').read() # open the txt file
    # text = text.lower() # make all lowercase
    # document = nltk.word_tokenize(text) # make a list of words

    # html = []
    # for word in document:
    #     tf_count, idf_count, tfidf_value = tfidf(word, document, corpus)
    #     element = f'{ word } '
    #     html.append(element)

    # # write html to file
    # out = open('tfidf.html', 'w')
    # html = "".join(html) # convert the list to a string
    # out.write(html)
    # out.close()