You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

87 lines
1.9 KiB
Python

import os
import math
import nltk
def tfidf(query, document, corpus):
    """Compute the TF-IDF score of ``query`` for one document of a corpus.

    Args:
        query: The term to score.
        document: A sized collection of terms (e.g. a list of tokens); the
            term frequency is computed against it.
        corpus: A sized collection of documents; each document must support
            the ``in`` operator.

    Returns:
        A tuple ``(tf_count, idf_count, tfidf_value)`` where ``tf_count`` is
        the number of items in ``document`` equal to ``query``,
        ``idf_count`` is the number of documents in ``corpus`` containing
        ``query`` (a raw document count, not the logarithmic IDF), and
        ``tfidf_value`` is ``(tf_count / len(document)) *
        log(len(corpus) / idf_count)``.

        ``tfidf_value`` is ``0.0`` when ``document`` is empty or when no
        document in ``corpus`` contains ``query``.
    """
    # Term Frequency
    tf_count = sum(1 for word in document if query == word)
    # An empty document has no terms, so the relative frequency is defined
    # as zero instead of dividing by zero.
    tf = tf_count / len(document) if len(document) else 0.0

    # Inverse Document Frequency
    idf_count = sum(1 for doc in corpus if query in doc)
    # A term that occurs in no document would make the ratio divide by zero;
    # give it a neutral weight of zero.
    idf = math.log(len(corpus) / idf_count) if idf_count else 0.0

    # Term Frequency * Inverse Document Frequency
    tfidf_value = tf * idf
    return tf_count, idf_count, tfidf_value
# ---
# USE THE TF-IDF ALGORITHM
# Build the corpus: one list of lowercase tokens per file in the directory.
directory = './txt/'
textfiles = os.listdir(directory)
corpus = []
for manifesto in textfiles:
    # The context manager closes each file handle as soon as it has been
    # read, instead of leaving it open until garbage collection.
    with open(f'{ directory }/{ manifesto }', 'r') as textfile:
        text = textfile.read()  # read the whole txt file
    text = text.lower()  # make all lowercase
    document = nltk.word_tokenize(text)  # make a list of words
    corpus.append(document)
# q = 'world' # query
# d = corpus[0] # one document
# c = corpus # full corpus
# tf_count, idf_count, tfidf_value = tfidf(q, d, c)
# print('query:', q)
# print('TF:', tf_count)
# print('IDF:', idf_count)
# print('TF-IDF:', tfidf_value)
# ---
# PRINT OUT ALL TF-IDF VALUES
# for document in corpus:
# for word in document:
# tf_count, idf_count, tfidf_value = tfidf(word, document, corpus)
# print('query:', word)
# print('TF:', tf_count)
# print('IDF:', idf_count)
# print('TF-IDF:', tfidf_value)
# print('---')
# ---
# VISUALISE THE TF-IDF, USING ONE OF THE MANIFESTOS
# manifesto = textfiles[8] # pick one manifesto
# text = open(f'{ directory }/{ manifesto }', 'r').read() # open the txt file
# text = text.lower() # make all lowercase
# document = nltk.word_tokenize(text) # make a list of words
# html = []
# for word in document:
# tf_count, idf_count, tfidf_value = tfidf(word, document, corpus)
# element = f'<span style="font-size:{ tfidf_value * 1000 }px;">{ word } </span>'
# html.append(element)
# # write html to file
# out = open('tfidf.html', 'w')
# html = "".join(html) # convert the list to a string
# out.write(html)
# out.close()