import os
import math
import nltk

def tfidf(query, document, corpus):
    """Return (tf_count, idf_count, tfidf_value) for `query` in `document`,
    where `corpus` is the list of all tokenised documents."""

    # Term Frequency: how often the query occurs in this document
    tf_count = 0
    for word in document:
        if query == word:
            tf_count += 1
    tf = tf_count / len(document)

    # Inverse Document Frequency: in how many documents the query occurs
    # (a separate loop variable avoids shadowing the `document` argument)
    idf_count = 0
    for doc in corpus:
        if query in doc:
            idf_count += 1
    idf = math.log(len(corpus) / idf_count)  # raises ZeroDivisionError if the query is in no document

    # Term Frequency * Inverse Document Frequency
    tfidf_value = tf * idf

    return tf_count, idf_count, tfidf_value

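# A quick sanity check (editor's illustration, not part of the original script):
# with a toy corpus of three tokenised documents, 'cat' appears in 1 of 3
# documents, so IDF = ln(3/1) ≈ 1.0986, and its TF in the first document is 1/3,
# giving a TF-IDF of about 0.3662.
#
# toy_corpus = [['the', 'cat', 'sat'], ['the', 'dog', 'sat'], ['the', 'bird']]
# print(tfidf('cat', toy_corpus[0], toy_corpus))   # -> (1, 1, 0.3662...)
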
# ---

# USE THE TF-IDF ALGORITHM

directory = './txt/'
textfiles = os.listdir(directory)
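# Note (editor's suggestion, not in the original): os.listdir() also returns any
# hidden or non-text files in the folder; if that is a concern, the listing could
# be filtered, e.g.:
# textfiles = [f for f in os.listdir(directory) if f.endswith('.txt')]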

corpus = []

for manifesto in textfiles:
    text = open(f'{ directory }/{ manifesto }', 'r').read() # open the txt file
    text = text.lower() # make all lowercase
    document = nltk.word_tokenize(text) # make a list of words (needs NLTK's 'punkt' tokenizer data)
    corpus.append(document)

# q = 'world' # query
# d = corpus[0] # one document
# c = corpus # full corpus

# tf_count, idf_count, tfidf_value = tfidf(q, d, c)

# print('query:', q)
# print('TF:', tf_count)
# print('IDF:', idf_count)
# print('TF-IDF:', tfidf_value)

# ---

# PRINT OUT ALL TF-IDF VALUES

# for document in corpus:
#     for word in document:
#         tf_count, idf_count, tfidf_value = tfidf(word, document, corpus)
#
#         print('query:', word)
#         print('TF:', tf_count)
#         print('IDF:', idf_count)
#         print('TF-IDF:', tfidf_value)
#         print('---')

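# A possible extension (editor's sketch, not in the original script): rather than
# printing every value, collect the scores for one document and show only its
# highest-scoring words; 'top_n' is a hypothetical name.
#
# top_n = 10
# scores = {word: tfidf(word, corpus[0], corpus)[2] for word in set(corpus[0])}
# for word, value in sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:top_n]:
#     print(word, round(value, 4))
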
# ---

# VISUALISE THE TF-IDF, USING ONE OF THE MANIFESTOS

# manifesto = textfiles[8] # pick one manifesto

# text = open(f'{ directory }/{ manifesto }', 'r').read() # open the txt file
# text = text.lower() # make all lowercase
# document = nltk.word_tokenize(text) # make a list of words

# html = []

# for word in document:
#     tf_count, idf_count, tfidf_value = tfidf(word, document, corpus)
#     element = f'<span style="font-size:{ tfidf_value * 1000 }px;">{ word } </span>'
#     html.append(element)

# # write html to file
# out = open('tfidf.html', 'w')
# html = "".join(html) # convert the list to a string
# out.write(html)
# out.close()
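
# A variation to consider (editor's sketch, not in the original script): the fixed
# factor of 1000 makes the font size depend heavily on the corpus; scaling each
# score against the document's largest TF-IDF value keeps the output in a readable
# range (roughly 10-60px here).
#
# values = [tfidf(word, document, corpus)[2] for word in document]
# largest = max(values) or 1
# html = [f'<span style="font-size:{ 10 + 50 * v / largest }px;">{ w } </span>'
#         for w, v in zip(document, values)]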