You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
58 lines
1.8 KiB
Python
58 lines
1.8 KiB
Python
5 years ago
|
"""Count the most frequent words of a Terms of Service text and suggest a category.

The script asks for the path of a text file, counts every word that is not
listed in ``stopwords.txt``, prints the most common ones and checks whether
any of them belongs to the "Library Studies" word list
(``library_studies.txt``).
"""
import collections

# this script was adapted from:
# https://towardsdatascience.com/very-simple-python-script-for-extracting-most-common-words-from-a-story-1e3570d0b9d0
# https://git.xpub.nl/rita/categorization_of_files/src/branch/master/categorization.py

# Characters removed from every token so "book" and "book!" count as the same word.
# NOTE: closing curly quotes are deliberately not part of this set (the right
# single quote doubles as an apostrophe); pass a custom ``punctuation`` to
# ``count_words`` to strip more characters.
PUNCTUATION = '.,:"!“‘*'

# How many of the most common words are printed and checked against the category.
N_PRINT = 5


def load_word_set(path: str) -> set:
    """Return the non-empty, whitespace-stripped lines of *path* as a set.

    The file is read as UTF-8 (the same encoding used for the analysed text)
    and is closed as soon as it has been read. Blank lines are ignored so an
    empty string can never act as a stopword or as a category word.
    """
    with open(path, encoding="utf8") as handle:
        stripped = (line.strip() for line in handle)
        return {entry for entry in stripped if entry}


def count_words(text: str, stopwords=frozenset(), punctuation: str = PUNCTUATION):
    """Count the words of *text*, ignoring case, punctuation and *stopwords*.

    The text is lower-cased and split on whitespace; every character listed in
    *punctuation* is then removed from each token. Tokens that end up empty
    (for example a lone "*" bullet) are skipped instead of being counted as a
    word.

    Returns a ``collections.Counter`` mapping word -> number of occurrences.
    Words are inserted in order of first appearance, so ties in
    ``most_common`` are resolved by that order.
    """
    # One translation table replaces a chain of single-character str.replace calls.
    strip_table = str.maketrans("", "", punctuation)
    cleaned = (token.translate(strip_table) for token in text.lower().split())
    return collections.Counter(
        word for word in cleaned if word and word not in stopwords
    )


def has_category_match(words, category_words) -> bool:
    """Return True when at least one of *words* is contained in *category_words*."""
    return any(word in category_words for word in words)


def main() -> None:
    """Run the interactive analysis: prompt, count, print and categorize."""
    # open and read file
    path = input("\nwhich platform's Terms of Service do you want to look at: \n")
    with open(path, encoding="utf8") as source:
        text = source.read()

    # my stopwords are common words I don't want to count, like "a", "an", "the".
    stopwords = load_word_set('stopwords.txt')

    word_counter = count_words(text, stopwords)

    # print x most common words
    print("\nMost used colonial words are:")
    top_words = word_counter.most_common(N_PRINT)
    for word, count in top_words:
        print(word, "—", count)

    # categories

    # words that are inside the category Library Studies
    library_studies = load_word_set('library_studies.txt')

    # Exactly one message is printed: the suggestion as soon as any of the top
    # words belongs to the category, otherwise the "no suggestion" notice.
    if has_category_match((word for word, _ in top_words), library_studies):
        print("\nWe suggest the following categorization for this file:\nLibrary Studies\n")
    else:
        print("\nWe don't have any suggestion of categorization for this file.\n")


if __name__ == "__main__":
    main()
|