# bo-graduation/200312/categorization_of_files/categorization.py

import collections

# this script was adapted from:
# https://towardsdatascience.com/very-simple-python-script-for-extracting-most-common-words-from-a-story-1e3570d0b9d0
# https://git.xpub.nl/rita/categorization_of_files/src/branch/master/categorization.py


# Substrings removed from every token so that "book" and "book!" are counted
# as the same word. Each entry is removed in turn, in the order listed.
STRIP_MARKS = (
    ".", ",", ":", "\"", "!",
    # Mojibake produced when the UTF-8 bytes of the curly quotes U+201C and
    # U+2018 are decoded as cp1252. Kept so already-damaged text is cleaned.
    "\u00e2\u20ac\u0153",
    "\u00e2\u20ac\u02dc",
    "*",
    # The real curly quotes, which is what a correctly decoded UTF-8 file
    # contains. U+2019 is deliberately left alone: it doubles as the
    # apostrophe in contractions, and the straight apostrophe is kept too.
    "\u201c", "\u201d", "\u2018",
)

# How many of the most frequent words are printed and checked against the
# category word list.
N_PRINT = 5


def load_word_set(path):
    """Return the set of whitespace-stripped lines of the text file *path*.

    The file is decoded as UTF-8, like the analysed text, so that words from
    both sources compare equal. The handle is closed before returning.

    Raises:
        OSError: if the file cannot be opened.
    """
    with open(path, encoding="utf8") as handle:
        return {line.strip() for line in handle}


def clean_token(token):
    """Return *token* with every entry of ``STRIP_MARKS`` removed."""
    for mark in STRIP_MARKS:
        token = token.replace(mark, "")
    return token


def count_words(text, stopwords):
    """Count the words of *text* that are not in *stopwords*.

    The text is lower-cased and split on whitespace, and every token is
    passed through ``clean_token``. Tokens that end up empty (for example a
    lone "*" or "...") are skipped so that the empty string is never
    reported as a word.

    Args:
        text: the full text to analyse.
        stopwords: container of words to ignore; only ``in`` is used on it.

    Returns:
        A ``collections.Counter`` mapping each remaining word to its number
        of occurrences, in order of first appearance.
    """
    counts = collections.Counter()
    for token in text.lower().split():
        word = clean_token(token)
        if word and word not in stopwords:
            counts[word] += 1
    return counts


def main():
    """Prompt for a text file, print its top words and suggest a category."""
    path = input("\nwhich platform's Terms of Service do you want to look at: \n")
    # The context manager closes the file as soon as its text is in memory,
    # even if reading fails.
    with open(path, encoding="utf8") as file:
        text = file.read()

    # Stopwords are common words that should not be counted, like "a",
    # "an", "the".
    stopwords = load_word_set('stopwords.txt')
    word_counter = count_words(text, stopwords)

    print("\nMost used colonial words are:")
    top_words = word_counter.most_common(N_PRINT)
    for word, count in top_words:
        print(word, "—", count)

    # Words that belong to the category Library Studies.
    library_studies = load_word_set('library_studies.txt')

    # One hit among the printed top words is enough to suggest the category.
    if any(word in library_studies for word, _ in top_words):
        print("\nWe suggest the following categorization for this file:\nLibrary Studies\n")
    else:
        print("\nWe don't have any suggestion of categorization for this file.\n")


if __name__ == "__main__":
    main()