# bo-graduation/200312/categorization_of_files/colonialwords.py

import collections
# from termcolor import colored
# this script was adapted from:
# https://towardsdatascience.com/very-simple-python-script-for-extracting-most-common-words-from-a-story-1e3570d0b9d0
# https://git.xpub.nl/rita/categorization_of_files/src/branch/master/categorization.py


# How many of the most common words are printed and checked against the
# category word lists.
N_PRINT = 100

# Single characters removed from every token so that "book" and "book!" count
# as the same word.  The last three are the curly quotes (U+201C, U+201D,
# U+2018) as they appear in text that was decoded as UTF-8.  The right single
# quote (U+2019) is deliberately kept because it doubles as an apostrophe.
_STRIP_TABLE = str.maketrans("", "", '.,:"!*()\u201c\u201d\u2018')

# Multi-character sequences that are also removed.  These are the literals
# "â€œ" and "â€˜" from the original script; they appear to be U+201C / U+2018
# mis-decoded as cp1252, and are kept for inputs that already contain that
# mojibake.
_MOJIBAKE_QUOTES = ("\u00e2\u20ac\u0153", "\u00e2\u20ac\u02dc")


def read_word_list(path, encoding=None):
    """Return the set of whitespace-stripped lines of the text file at *path*.

    *encoding* is passed straight to ``open``; the default ``None`` keeps the
    platform default encoding.  The file is closed before returning.
    """
    with open(path, encoding=encoding) as handle:
        return {line.strip() for line in handle}


def normalize_word(token):
    """Return *token* with the punctuation listed above removed."""
    cleaned = token.translate(_STRIP_TABLE)
    for junk in _MOJIBAKE_QUOTES:
        cleaned = cleaned.replace(junk, "")
    return cleaned


def count_words(text, stopwords):
    """Count the words of *text*, ignoring case, punctuation and *stopwords*.

    The text is lower-cased and split on whitespace; every token is passed
    through ``normalize_word``.  Tokens that end up empty (pure punctuation
    such as ``*``) and tokens found in *stopwords* are not counted.

    Returns a ``collections.Counter`` mapping word -> number of occurrences.
    """
    counts = collections.Counter()
    for token in text.lower().split():
        word = normalize_word(token)
        if word and word not in stopwords:
            counts[word] += 1
    return counts


def main():
    """Prompt for a Terms of Service file, print its most common words and
    suggest a category for it."""
    # open and read file; the handle is released as soon as the text is read
    path = input("\nwhich platform's Terms of Service do you want to look at: \n")
    with open(path, encoding="utf8") as file:
        text = file.read()

    # my stopwords are common words I don't want to count, like "a", "an", "the".
    stopwords = read_word_list('stopwords.txt')
    word_counter = count_words(text, stopwords)

    # print x most common words
    print("\nMost used colonial words are:")
    top_words = word_counter.most_common(N_PRINT)
    for word, count in top_words:
        print(word, "—", count)

    # TODO: highlight the colonial words inside the original text, e.g. with
    # termcolor: colored(word, 'white', 'on_red')

    # categories
    # words that are inside the category Library Studies
    library_studies = read_word_list('library_studies.txt')

    # One hit among the most common words is enough to suggest the category.
    if any(word in library_studies for word, _ in top_words):
        print("\nWe suggest the following categorization for this platform:\nLibrary Studies\n")
    else:
        print("\nThese are the TikTok's colonial words.\n")


if __name__ == "__main__":
    main()
first commit 5 years ago			`import collections`
			`# from termcolor import colored`
			`# this script was adapted from:`
			`# https://towardsdatascience.com/very-simple-python-script-for-extracting-most-common-words-from-a-story-1e3570d0b9d0`
			`# https://git.xpub.nl/rita/categorization_of_files/src/branch/master/categorization.py`


			`# open and read file`
			`file = open(input("\nwhich platform's Terms of Service do you want to look at: \n"), encoding="utf8")`
			`a = file.read()`

			`# f = open("tiktok.txt", "r")`
			`# print(f.read())`

			`# my stopwords are common words I don't want to count, like "a", "an", "the".`
			`stopwords = set(line.strip() for line in open('stopwords.txt'))`

			`# dictionary`
			`wordcount = {}`

			`# spliting words from punctuation so "book" and "book!" counts as the same word`
			`for word in a.lower().split():`
			`word = word.replace(".","")`
			`word = word.replace(",","")`
			`word = word.replace(":","")`
			`word = word.replace("\"","")`
			`word = word.replace("!","")`
			`word = word.replace("â€œ","")`
			`word = word.replace("â€˜","")`
			`word = word.replace("*","")`
			`word = word.replace("(","")`
			`word = word.replace(")","")`

			`# counting`
			`if word not in stopwords:`
			`if word not in wordcount:`
			`wordcount[word] = 1`
			`else:`
			`wordcount[word] += 1`

			`# print x most common words`
			`n_print = int(100)`
			`print("\nMost used colonial words are:")`

			`word_counter = collections.Counter(wordcount)`
			`for word, count in word_counter.most_common(n_print):`
			`print(word,"—", count)`


			`# word_counter = collections.Counter(wordcount)`
			`# for word, count in word_counter.most_common(n_print):`
			`# print(word,"—", count)`


			`# colonial texts in bold`
			`# for word in n_print:`
			`# if word in n_print:`
			`# wordcount.append(colored(word, 'white', 'on_red'))`
			`# else:`
			`# wordcount.append(t)`


			`# print(" ".join(colored(word, 'white', 'on_red'))`





			`# categories`

			`# words that are inside the category Library Studies`
			`library_studies = set(line.strip() for line in open('library_studies.txt'))`

			`for word, count in word_counter.most_common(n_print):`
			`if word in library_studies:`
			`print("\nWe suggest the following categorization for this platform:\nLibrary Studies\n")`
			`break`
			`else:`
			`print("\nThese are the TikTok's colonial words.\n")`

			`# Close the file`
			`file.close()`