You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

83 lines
2.2 KiB
Python

5 years ago
import collections
# from termcolor import colored
# this script was adapted from:
# https://towardsdatascience.com/very-simple-python-script-for-extracting-most-common-words-from-a-story-1e3570d0b9d0
# https://git.xpub.nl/rita/categorization_of_files/src/branch/master/categorization.py
# Ask for the path of the Terms of Service text file and read all of it.
# The `with` block closes the handle as soon as the text is in memory, even
# if reading fails. The name `file` is kept because the script calls
# file.close() again at the very end, which is a harmless no-op on an
# already-closed file.
with open(input("\nwhich platform's Terms of Service do you want to look at: \n"), encoding="utf8") as file:
    a = file.read()
# f = open("tiktok.txt", "r")
# print(f.read())
# Stopwords are common words that should not be counted, like "a", "an", "the".
# They are read one per line from stopwords.txt, with surrounding whitespace
# stripped. The file is opened in a `with` block so the handle is closed right
# away, and decoded as UTF-8 to match how the Terms of Service text is read.
with open('stopwords.txt', encoding="utf8") as stopwords_file:
    stopwords = {line.strip() for line in stopwords_file}
# Tally how often each non-stopword appears in the text: word -> occurrences.
wordcount = {}
# Punctuation removed from every token so that "book" and "book!" count as
# the same word. The closing curly double quote is stripped along with the
# opening one; the closing single quote (’) is left alone because it also
# serves as an apostrophe inside words.
strip_punctuation = str.maketrans("", "", ".,:\"!“”‘*()")
for word in a.lower().split():
    word = word.translate(strip_punctuation)
    # A token made only of punctuation (e.g. "(" or "...") is empty after
    # stripping; it is not a word, so skip it instead of counting "".
    if not word or word in stopwords:
        continue
    wordcount[word] = wordcount.get(word, 0) + 1
# Report the most frequent words, highest count first.
n_print = 100  # how many of the top words to list
print("\nMost used colonial words are:")
word_counter = collections.Counter(wordcount)
top_words = word_counter.most_common(n_print)
for entry in top_words:
    term, occurrences = entry
    # The empty middle argument yields two spaces between word and count.
    print(term, "", occurrences)
# word_counter = collections.Counter(wordcount)
# for word, count in word_counter.most_common(n_print):
# print(word,"—", count)
# colonial texts in bold
# for word in n_print:
# if word in n_print:
# wordcount.append(colored(word, 'white', 'on_red'))
# else:
# wordcount.append(t)
# print(" ".join(colored(word, 'white', 'on_red'))
# Categories
# Words that belong to the category "Library Studies", one per line in
# library_studies.txt. The file is read inside a `with` block so the handle
# is closed instead of staying open for the rest of the run, and decoded as
# UTF-8 to match how the Terms of Service text is read.
with open('library_studies.txt', encoding="utf8") as library_studies_file:
    library_studies = {line.strip() for line in library_studies_file}
# Suggest the category as soon as one of the top words belongs to it.
# NOTE(review): the `else` is taken to belong to the `for` (for/else), so the
# fallback message prints once, only when no top word matched - confirm this
# is the intended behavior.
for word, count in word_counter.most_common(n_print):
    if word in library_studies:
        print("\nWe suggest the following categorization for this platform:\nLibrary Studies\n")
        break
else:
    print("\nThese are the TikTok's colonial words.\n")
# Close the Terms of Service file that was opened at the start of the script.
file.close()