first commit
(10 binary image files added: 87 KiB, 105 KiB, 137 KiB, 431 KiB, 223 KiB, 290 KiB, 271 KiB, 1.5 MiB, 502 KiB, 341 KiB)
@@ -0,0 +1,57 @@
import collections

# this script was adapted from:
# https://towardsdatascience.com/very-simple-python-script-for-extracting-most-common-words-from-a-story-1e3570d0b9d0
# https://git.xpub.nl/rita/categorization_of_files/src/branch/master/categorization.py


# open and read file
file = open(input("\nWhich platform's Terms of Service do you want to look at: \n"), encoding="utf8")
a = file.read()

# my stopwords are common words I don't want to count, like "a", "an", "the"
stopwords = set(line.strip() for line in open('stopwords.txt'))

# dictionary that maps each word to its count
wordcount = {}

# strip punctuation so "book" and "book!" count as the same word
for word in a.lower().split():
    word = word.replace(".", "")
    word = word.replace(",", "")
    word = word.replace(":", "")
    word = word.replace("\"", "")
    word = word.replace("!", "")
    word = word.replace("“", "")
    word = word.replace("‘", "")
    word = word.replace("*", "")

    # counting
    if word not in stopwords:
        if word not in wordcount:
            wordcount[word] = 1
        else:
            wordcount[word] += 1

# print the n most common words
# n_print = int(input("How many most common words to print: "))
n_print = 5
print("\nMost used colonial words are:")
word_counter = collections.Counter(wordcount)
for word, count in word_counter.most_common(n_print):
    print(word, "—", count)


# categories

# words that belong to the category Library Studies
library_studies = set(line.strip() for line in open('library_studies.txt'))

for word, count in word_counter.most_common(n_print):
    if word in library_studies:
        print("\nWe suggest the following categorization for this file:\nLibrary Studies\n")
        break
else:
    print("\nWe don't have any suggestion of categorization for this file.\n")

# Close the file
file.close()
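The chain of replace() calls in the script above could also be written with a single translation table. The sketch below is not part of the original commit; it is one possible consolidation, assuming the same punctuation set and the same stopword filtering:

import collections

# Sketch only: same behaviour as the replace() chain above, assuming the
# characters . , : " ! “ ‘ * are the ones to strip.
PUNCTUATION = '.,:"!“‘*'
STRIP_TABLE = str.maketrans("", "", PUNCTUATION)

def count_words(text, stopwords):
    # lower-case, strip punctuation, skip stopwords, count what remains
    counts = collections.Counter()
    for word in text.lower().split():
        word = word.translate(STRIP_TABLE)
        if word and word not in stopwords:
            counts[word] += 1
    return counts

# usage (hypothetical file name):
# count_words(open('tiktok.txt', encoding='utf8').read(), stopwords)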
@@ -0,0 +1,82 @@
import collections
# from termcolor import colored

# this script was adapted from:
# https://towardsdatascience.com/very-simple-python-script-for-extracting-most-common-words-from-a-story-1e3570d0b9d0
# https://git.xpub.nl/rita/categorization_of_files/src/branch/master/categorization.py


# open and read file
file = open(input("\nWhich platform's Terms of Service do you want to look at: \n"), encoding="utf8")
a = file.read()

# f = open("tiktok.txt", "r")
# print(f.read())

# my stopwords are common words I don't want to count, like "a", "an", "the"
stopwords = set(line.strip() for line in open('stopwords.txt'))

# dictionary that maps each word to its count
wordcount = {}

# strip punctuation so "book" and "book!" count as the same word
for word in a.lower().split():
    word = word.replace(".", "")
    word = word.replace(",", "")
    word = word.replace(":", "")
    word = word.replace("\"", "")
    word = word.replace("!", "")
    word = word.replace("“", "")
    word = word.replace("‘", "")
    word = word.replace("*", "")
    word = word.replace("(", "")
    word = word.replace(")", "")

    # counting
    if word not in stopwords:
        if word not in wordcount:
            wordcount[word] = 1
        else:
            wordcount[word] += 1

# print the n most common words
n_print = 100
print("\nMost used colonial words are:")

word_counter = collections.Counter(wordcount)
for word, count in word_counter.most_common(n_print):
    print(word, "—", count)


# word_counter = collections.Counter(wordcount)
# for word, count in word_counter.most_common(n_print):
#     print(word, "—", count)


# colonial texts in bold
# for word in n_print:
#     if word in n_print:
#         wordcount.append(colored(word, 'white', 'on_red'))
#     else:
#         wordcount.append(t)

# print(" ".join(colored(word, 'white', 'on_red'))


# categories

# words that belong to the category Library Studies
library_studies = set(line.strip() for line in open('library_studies.txt'))

for word, count in word_counter.most_common(n_print):
    if word in library_studies:
        print("\nWe suggest the following categorization for this platform:\nLibrary Studies\n")
        break
else:
    print("\nThese are TikTok's colonial words.\n")

# Close the file
file.close()
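The commented-out block above sketches the idea of printing the most common "colonial" words highlighted with termcolor, but as written it appends to a dictionary and iterates over an integer. A minimal working sketch of that idea (not part of the original commit; it assumes termcolor is installed and that word_counter and n_print are defined as in the script above) could be:

from termcolor import colored

def highlight_top_words(word_counter, n_print):
    # colour each of the n most common words white on a red background
    top_words = [word for word, count in word_counter.most_common(n_print)]
    return " ".join(colored(word, "white", "on_red") for word in top_words)

# usage: print(highlight_top_words(word_counter, n_print))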
(binary image added: 234 KiB)
@@ -0,0 +1,67 @@
-
a
about
all
an
and
are
as
at
be
but
by
can
do
for
from
get
had
has
have
he
I
i
if
in
into
is
it
its
me
more
my
not
of
on
one
or
other
out
so
some
such
than
that
the
their
them
then
there
these
they
this
those
to
up
was
were
what
when
which
who
whom
will
with
would

—
@@ -0,0 +1,19 @@
archives
author
bibliographic
bibliotheca
book
bookcase
books
bookshelf
bookstore
catalogue
e-book
librarian
librarianship
library
literature
manuscripts
papyrus
read
reading
(binary image added: 282 KiB)