# prototypes/tf-idf/tfidf.py

import os
import math
import nltk

def tfidf(query, document, corpus):
	"""Compute the TF-IDF weight of *query* for *document* within *corpus*.

	Args:
		query: The term to score.
		document: A sized collection of tokens (e.g. a list of words) in
			which occurrences of ``query`` are counted.
		corpus: A sized collection of documents; each one is tested for
			containing ``query`` with the ``in`` operator.

	Returns:
		A tuple ``(tf_count, idf_count, tfidf_value)`` where ``tf_count`` is
		the number of tokens in ``document`` equal to ``query``,
		``idf_count`` is the number of documents in ``corpus`` that contain
		``query``, and ``tfidf_value`` is
		``(tf_count / len(document)) * log(len(corpus) / idf_count)``.

		When ``document`` is empty the term frequency is taken as 0.0, and
		when no document in ``corpus`` contains ``query`` the inverse
		document frequency is taken as 0.0, so ``tfidf_value`` is 0.0
		instead of a ``ZeroDivisionError`` being raised.
	"""

	# Term Frequency
	tf_count = sum(1 for word in document if query == word)
	# Guard against an empty document: there is nothing to normalise by.
	tf = tf_count / len(document) if len(document) else 0.0

	# Inverse Document Frequency
	# A separate loop name keeps the ``document`` parameter from being shadowed.
	idf_count = sum(1 for other_document in corpus if query in other_document)
	# A term that appears in no document has no defined IDF; weight it as 0.
	idf = math.log(len(corpus) / idf_count) if idf_count else 0.0

	# Term Frequency * Inverse Document Frequency
	tfidf_value = tf * idf

	return tf_count, idf_count, tfidf_value

# ---

# USE THE TF-IDF ALGORITHM

# Load every file found in `directory` and turn it into a list of lowercase
# tokens; `corpus` holds one token list per file, in `textfiles` order.
directory = './txt/'
textfiles = os.listdir(directory)

corpus = []

for manifesto in textfiles:
	# os.path.join avoids the doubled slash that string formatting produced
	# ('./txt//name'), and `with` closes the file handle once it is read.
	with open(os.path.join(directory, manifesto), 'r') as textfile:
		text = textfile.read() # open the txt file
	text = text.lower() # make all lowercase
	document = nltk.word_tokenize(text) # make a list of words
	corpus.append(document)

# q = 'world' # query
# d = corpus[0] # one document
# c = corpus # full corpus

# tf_count, idf_count, tfidf_value = tfidf(q, d, c)

# print('query:', q)
# print('TF:', tf_count)
# print('IDF:', idf_count)
# print('TF-IDF:', tfidf_value)

# ---

# PRINT OUT ALL TF-IDF VALUES

# for document in corpus:
# 	for word in document:
# 		tf_count, idf_count, tfidf_value = tfidf(word, document, corpus)

		# print('query:', word)
		# print('TF:', tf_count)
		# print('IDF:', idf_count)
		# print('TF-IDF:', tfidf_value)
		# print('---')

# ---

# VISUALISE THE TF-IDF, USING ONE OF THE MANIFESTOS

# manifesto = textfiles[8] # pick one manifesto

# text = open(f'{ directory }/{ manifesto }', 'r').read() # open the txt file
# text = text.lower() # make all lowercase
# document = nltk.word_tokenize(text) # make a list of words

# html = []

# for word in document:
# 	tf_count, idf_count, tfidf_value = tfidf(word, document, corpus)
# 	element = f'<span style="font-size:{ tfidf_value * 1000 }px;">{ word } </span>'
# 	html.append(element)

# # write html to file
# out = open('tfidf.html', 'w')
# html = "".join(html) # convert the list to a string
# out.write(html)
# out.close()