first commit
After Width: | Height: | Size: 87 KiB |
After Width: | Height: | Size: 105 KiB |
After Width: | Height: | Size: 137 KiB |
After Width: | Height: | Size: 431 KiB |
After Width: | Height: | Size: 223 KiB |
After Width: | Height: | Size: 290 KiB |
After Width: | Height: | Size: 271 KiB |
After Width: | Height: | Size: 1.5 MiB |
After Width: | Height: | Size: 502 KiB |
After Width: | Height: | Size: 341 KiB |
@ -0,0 +1,57 @@
|
||||
import collections
|
||||
|
||||
# this script was adapted from:
|
||||
# https://towardsdatascience.com/very-simple-python-script-for-extracting-most-common-words-from-a-story-1e3570d0b9d0
|
||||
# https://git.xpub.nl/rita/categorization_of_files/src/branch/master/categorization.py
|
||||
|
||||
|
||||
# Characters removed from every token so that "book" and "book!" count as the
# same word. Both opening and closing typographic double quotes are listed;
# the typographic apostrophe (’) is deliberately left alone because it also
# occurs inside words such as "don’t".
PUNCTUATION = '.,:"!?;“”‘*()'
_STRIP_PUNCTUATION = str.maketrans("", "", PUNCTUATION)

# how many of the most common words are printed and checked against the category
N_PRINT = 5


def load_word_set(path):
    """Return the stripped, non-empty lines of the text file *path* as a set.

    The file is read as UTF-8 (the stop list contains a non-ASCII dash) and is
    closed again before returning.
    """
    with open(path, encoding="utf8") as handle:
        stripped = (line.strip() for line in handle)
        return {entry for entry in stripped if entry}


def count_words(text, stopwords):
    """Count how often each word occurs in *text*.

    The text is lower-cased and split on whitespace, and every character in
    PUNCTUATION is removed from each token. Tokens that are empty after that
    clean-up, or that are contained in *stopwords*, are not counted.

    Returns a collections.Counter mapping word -> number of occurrences.
    """
    tokens = (
        token.translate(_STRIP_PUNCTUATION) for token in text.lower().split()
    )
    return collections.Counter(
        token for token in tokens if token and token not in stopwords
    )


def main():
    """Ask for a Terms of Service file, print its top words and a category."""
    # open and read file
    path = input("\nwhich platform's Terms of Service do you want to look at: \n")
    with open(path, encoding="utf8") as file:
        text = file.read()

    # my stopwords are common words I don't want to count, like "a", "an", "the".
    stopwords = load_word_set('stopwords.txt')

    # print x most common words
    word_counter = count_words(text, stopwords)
    top_words = word_counter.most_common(N_PRINT)
    print("\nMost used colonial words are:")
    for word, count in top_words:
        print(word, "—", count)

    # categories
    # words that are inside the category Library Studies
    library_studies = load_word_set('library_studies.txt')

    # NOTE(review): the indentation of the original loop was lost; this assumes
    # the fallback message is printed once, when none of the top words match.
    if any(word in library_studies for word, _ in top_words):
        print("\nWe suggest the following categorization for this file:\nLibrary Studies\n")
    else:
        print("\nWe don't have any suggestion of categorization for this file.\n")


if __name__ == "__main__":
    main()
|
@ -0,0 +1,82 @@
|
||||
import collections
|
||||
# from termcolor import colored
|
||||
# this script was adapted from:
|
||||
# https://towardsdatascience.com/very-simple-python-script-for-extracting-most-common-words-from-a-story-1e3570d0b9d0
|
||||
# https://git.xpub.nl/rita/categorization_of_files/src/branch/master/categorization.py
|
||||
|
||||
|
||||
# Characters removed from every token so that "book" and "book!" count as the
# same word. Both opening and closing typographic double quotes are listed;
# the typographic apostrophe (’) is deliberately left alone because it also
# occurs inside words such as "don’t".
PUNCTUATION = '.,:"!?;“”‘*()'
_STRIP_PUNCTUATION = str.maketrans("", "", PUNCTUATION)

# how many of the most common words are printed and checked against the category
N_PRINT = 100


def load_word_set(path):
    """Return the stripped, non-empty lines of the text file *path* as a set.

    The file is read as UTF-8 (the stop list contains a non-ASCII dash) and is
    closed again before returning.
    """
    with open(path, encoding="utf8") as handle:
        stripped = (line.strip() for line in handle)
        return {entry for entry in stripped if entry}


def count_words(text, stopwords):
    """Count how often each word occurs in *text*.

    The text is lower-cased and split on whitespace, and every character in
    PUNCTUATION is removed from each token. Tokens that are empty after that
    clean-up, or that are contained in *stopwords*, are not counted.

    Returns a collections.Counter mapping word -> number of occurrences.
    """
    tokens = (
        token.translate(_STRIP_PUNCTUATION) for token in text.lower().split()
    )
    return collections.Counter(
        token for token in tokens if token and token not in stopwords
    )


def main():
    """Ask for a Terms of Service file, print its top words and a category."""
    # open and read file
    path = input("\nwhich platform's Terms of Service do you want to look at: \n")
    with open(path, encoding="utf8") as file:
        text = file.read()

    # my stopwords are common words I don't want to count, like "a", "an", "the".
    stopwords = load_word_set('stopwords.txt')

    # print x most common words
    word_counter = count_words(text, stopwords)
    top_words = word_counter.most_common(N_PRINT)
    print("\nMost used colonial words are:")
    for word, count in top_words:
        print(word, "—", count)

    # TODO: highlight the colonial words in colour; an earlier attempt based on
    # termcolor's colored() was left unfinished and commented out here.

    # categories
    # words that are inside the category Library Studies
    library_studies = load_word_set('library_studies.txt')

    # NOTE(review): the indentation of the original loop was lost; this assumes
    # the fallback message is printed once, when none of the top words match.
    if any(word in library_studies for word, _ in top_words):
        print("\nWe suggest the following categorization for this platform:\nLibrary Studies\n")
    else:
        print("\nThese are the TikTok's colonial words.\n")


if __name__ == "__main__":
    main()
|
After Width: | Height: | Size: 234 KiB |
@ -0,0 +1,67 @@
|
||||
-
|
||||
a
|
||||
about
|
||||
all
|
||||
an
|
||||
and
|
||||
are
|
||||
as
|
||||
at
|
||||
be
|
||||
but
|
||||
by
|
||||
can
|
||||
do
|
||||
for
|
||||
from
|
||||
get
|
||||
had
|
||||
has
|
||||
have
|
||||
he
|
||||
I
|
||||
i
|
||||
if
|
||||
in
|
||||
into
|
||||
is
|
||||
it
|
||||
its
|
||||
me
|
||||
more
|
||||
my
|
||||
not
|
||||
of
|
||||
on
|
||||
one
|
||||
or
|
||||
other
|
||||
out
|
||||
so
|
||||
some
|
||||
such
|
||||
than
|
||||
that
|
||||
the
|
||||
their
|
||||
them
|
||||
then
|
||||
there
|
||||
these
|
||||
they
|
||||
this
|
||||
those
|
||||
to
|
||||
up
|
||||
was
|
||||
were
|
||||
what
|
||||
when
|
||||
which
|
||||
who
|
||||
whom
|
||||
will
|
||||
with
|
||||
would
|
||||
|
|
||||
—
|
@ -0,0 +1,57 @@
|
||||
import collections
|
||||
|
||||
# this script was adapted from:
|
||||
# https://towardsdatascience.com/very-simple-python-script-for-extracting-most-common-words-from-a-story-1e3570d0b9d0
|
||||
# https://git.xpub.nl/rita/categorization_of_files/src/branch/master/categorization.py
|
||||
|
||||
|
||||
# Characters removed from every token so that "book" and "book!" count as the
# same word. Both opening and closing typographic double quotes are listed;
# the typographic apostrophe (’) is deliberately left alone because it also
# occurs inside words such as "don’t".
PUNCTUATION = '.,:"!?;“”‘*()'
_STRIP_PUNCTUATION = str.maketrans("", "", PUNCTUATION)

# how many of the most common words are printed and checked against the category
N_PRINT = 5


def load_word_set(path):
    """Return the stripped, non-empty lines of the text file *path* as a set.

    The file is read as UTF-8 (the stop list contains a non-ASCII dash) and is
    closed again before returning.
    """
    with open(path, encoding="utf8") as handle:
        stripped = (line.strip() for line in handle)
        return {entry for entry in stripped if entry}


def count_words(text, stopwords):
    """Count how often each word occurs in *text*.

    The text is lower-cased and split on whitespace, and every character in
    PUNCTUATION is removed from each token. Tokens that are empty after that
    clean-up, or that are contained in *stopwords*, are not counted.

    Returns a collections.Counter mapping word -> number of occurrences.
    """
    tokens = (
        token.translate(_STRIP_PUNCTUATION) for token in text.lower().split()
    )
    return collections.Counter(
        token for token in tokens if token and token not in stopwords
    )


def main():
    """Ask for a Terms of Service file, print its top words and a category."""
    # open and read file
    path = input("\nwhich platform's Terms of Service do you want to look at: \n")
    with open(path, encoding="utf8") as file:
        text = file.read()

    # my stopwords are common words I don't want to count, like "a", "an", "the".
    stopwords = load_word_set('stopwords.txt')

    # print x most common words
    word_counter = count_words(text, stopwords)
    top_words = word_counter.most_common(N_PRINT)
    print("\nMost used colonial words are:")
    for word, count in top_words:
        print(word, "—", count)

    # categories
    # words that are inside the category Library Studies
    library_studies = load_word_set('library_studies.txt')

    # NOTE(review): the indentation of the original loop was lost; this assumes
    # the fallback message is printed once, when none of the top words match.
    if any(word in library_studies for word, _ in top_words):
        print("\nWe suggest the following categorization for this file:\nLibrary Studies\n")
    else:
        print("\nWe don't have any suggestion of categorization for this file.\n")


if __name__ == "__main__":
    main()
|
@ -0,0 +1,82 @@
|
||||
import collections
|
||||
# from termcolor import colored
|
||||
# this script was adapted from:
|
||||
# https://towardsdatascience.com/very-simple-python-script-for-extracting-most-common-words-from-a-story-1e3570d0b9d0
|
||||
# https://git.xpub.nl/rita/categorization_of_files/src/branch/master/categorization.py
|
||||
|
||||
|
||||
# Characters removed from every token so that "book" and "book!" count as the
# same word. Both opening and closing typographic double quotes are listed;
# the typographic apostrophe (’) is deliberately left alone because it also
# occurs inside words such as "don’t".
PUNCTUATION = '.,:"!?;“”‘*()'
_STRIP_PUNCTUATION = str.maketrans("", "", PUNCTUATION)

# how many of the most common words are printed and checked against the category
N_PRINT = 100


def load_word_set(path):
    """Return the stripped, non-empty lines of the text file *path* as a set.

    The file is read as UTF-8 (the stop list contains a non-ASCII dash) and is
    closed again before returning.
    """
    with open(path, encoding="utf8") as handle:
        stripped = (line.strip() for line in handle)
        return {entry for entry in stripped if entry}


def count_words(text, stopwords):
    """Count how often each word occurs in *text*.

    The text is lower-cased and split on whitespace, and every character in
    PUNCTUATION is removed from each token. Tokens that are empty after that
    clean-up, or that are contained in *stopwords*, are not counted.

    Returns a collections.Counter mapping word -> number of occurrences.
    """
    tokens = (
        token.translate(_STRIP_PUNCTUATION) for token in text.lower().split()
    )
    return collections.Counter(
        token for token in tokens if token and token not in stopwords
    )


def main():
    """Ask for a Terms of Service file, print its top words and a category."""
    # open and read file
    path = input("\nwhich platform's Terms of Service do you want to look at: \n")
    with open(path, encoding="utf8") as file:
        text = file.read()

    # my stopwords are common words I don't want to count, like "a", "an", "the".
    stopwords = load_word_set('stopwords.txt')

    # print x most common words
    word_counter = count_words(text, stopwords)
    top_words = word_counter.most_common(N_PRINT)
    print("\nMost used colonial words are:")
    for word, count in top_words:
        print(word, "—", count)

    # TODO: highlight the colonial words in colour; an earlier attempt based on
    # termcolor's colored() was left unfinished and commented out here.

    # categories
    # words that are inside the category Library Studies
    library_studies = load_word_set('library_studies.txt')

    # NOTE(review): the indentation of the original loop was lost; this assumes
    # the fallback message is printed once, when none of the top words match.
    if any(word in library_studies for word, _ in top_words):
        print("\nWe suggest the following categorization for this platform:\nLibrary Studies\n")
    else:
        print("\nThese are the TikTok's colonial words.\n")


if __name__ == "__main__":
    main()
|
After Width: | Height: | Size: 234 KiB |
@ -0,0 +1,19 @@
|
||||
archives
|
||||
author
|
||||
bibliographic
|
||||
bibliotheca
|
||||
book
|
||||
bookcase
|
||||
books
|
||||
bookshelf
|
||||
bookstore
|
||||
catalogue
|
||||
e-book
|
||||
librarian
|
||||
librarianship
|
||||
library
|
||||
literature
|
||||
manuscripts
|
||||
papyrus
|
||||
read
|
||||
reading
|
@ -0,0 +1,67 @@
|
||||
-
|
||||
a
|
||||
about
|
||||
all
|
||||
an
|
||||
and
|
||||
are
|
||||
as
|
||||
at
|
||||
be
|
||||
but
|
||||
by
|
||||
can
|
||||
do
|
||||
for
|
||||
from
|
||||
get
|
||||
had
|
||||
has
|
||||
have
|
||||
he
|
||||
I
|
||||
i
|
||||
if
|
||||
in
|
||||
into
|
||||
is
|
||||
it
|
||||
its
|
||||
me
|
||||
more
|
||||
my
|
||||
not
|
||||
of
|
||||
on
|
||||
one
|
||||
or
|
||||
other
|
||||
out
|
||||
so
|
||||
some
|
||||
such
|
||||
than
|
||||
that
|
||||
the
|
||||
their
|
||||
them
|
||||
then
|
||||
there
|
||||
these
|
||||
they
|
||||
this
|
||||
those
|
||||
to
|
||||
up
|
||||
was
|
||||
were
|
||||
what
|
||||
when
|
||||
which
|
||||
who
|
||||
whom
|
||||
will
|
||||
with
|
||||
would
|
||||
|
|
||||
—
|
After Width: | Height: | Size: 282 KiB |