"""Count the most frequent words in a platform's Terms of Service.

The script asks for the path of a text file, counts every word that is not
listed in ``stopwords.txt``, prints the most frequent ones, and suggests the
"Library Studies" category when any of those words appears in
``library_studies.txt``.

This script was adapted from:
https://towardsdatascience.com/very-simple-python-script-for-extracting-most-common-words-from-a-story-1e3570d0b9d0
https://git.xpub.nl/rita/categorization_of_files/src/branch/master/categorization.py
"""

import collections

# How many of the most frequent words are printed and checked for a category.
N_PRINT = 100

# Characters deleted wherever they occur in a token, so that "book" and
# "book!" count as the same word.  Closing curly double quotes, "?" and ";"
# are included so that trailing punctuation is handled like leading
# punctuation.
_PUNCTUATION = str.maketrans("", "", ".,:\"!“”‘*()?;")

# The right single quote doubles as a typographic apostrophe ("platform’s"),
# so it is only removed from the edges of a token, never from the middle.
_EDGE_ONLY = "’"

# TODO: highlight the colonial words in the output (an earlier attempt used
# termcolor's `colored(word, 'white', 'on_red')`).


def load_word_set(path: str) -> set:
    """Return the non-blank lines of the file at *path* as a set.

    Each line is stripped of surrounding whitespace.  Blank lines are
    dropped.  The file is read as UTF-8, the same encoding used for the
    input text, and is closed before returning.
    """
    with open(path, encoding="utf8") as handle:
        stripped = (line.strip() for line in handle)
        return {entry for entry in stripped if entry}


def normalize(token: str) -> str:
    """Return *token* with punctuation removed (may be an empty string)."""
    return token.translate(_PUNCTUATION).strip(_EDGE_ONLY)


def count_words(text: str, stopwords) -> collections.Counter:
    """Count the words of *text*, ignoring case, punctuation and stopwords.

    *text* is lower-cased and split on whitespace.  Tokens that are empty
    after punctuation removal, or that are in *stopwords*, are not counted.
    """
    cleaned = (normalize(token) for token in text.lower().split())
    return collections.Counter(
        word for word in cleaned if word and word not in stopwords
    )


def matches_category(top_words, category_words) -> bool:
    """Return True if any word in *top_words* belongs to *category_words*.

    *top_words* is a sequence of ``(word, count)`` pairs, as returned by
    ``Counter.most_common``.
    """
    return any(word in category_words for word, _count in top_words)


def main() -> None:
    """Prompt for a file, print its most common words and a category hint."""
    path = input("\nwhich platform's Terms of Service do you want to look at: \n")
    with open(path, encoding="utf8") as source:
        text = source.read()

    # Stopwords are common words we don't want to count, like "a", "an", "the".
    stopwords = load_word_set("stopwords.txt")
    top_words = count_words(text, stopwords).most_common(N_PRINT)

    print("\nMost used colonial words are:")
    for word, count in top_words:
        print(word, "—", count)

    # Words that are inside the category Library Studies.
    library_studies = load_word_set("library_studies.txt")
    if matches_category(top_words, library_studies):
        print("\nWe suggest the following categorization for this platform:\nLibrary Studies\n")
    else:
        # NOTE(review): this message names TikTok regardless of which file
        # was chosen at the prompt; the text is left unchanged here.
        print("\nThese are the TikTok's colonial words.\n")


if __name__ == "__main__":
    main()