second script

parent b1c83b85f6
commit c6e1d2215d
(binary image files changed in this commit are not shown; previous sizes 87 KiB, 105 KiB and 223 KiB)
@@ -1,57 +0,0 @@
import collections

# this script was adapted from:
# https://towardsdatascience.com/very-simple-python-script-for-extracting-most-common-words-from-a-story-1e3570d0b9d0
# https://git.xpub.nl/rita/categorization_of_files/src/branch/master/categorization.py

# open and read the file
file = open(input("\nwhich platform's Terms of Service do you want to look at: \n"), encoding="utf8")
a = file.read()

# my stopwords are common words I don't want to count, like "a", "an", "the".
stopwords = set(line.strip() for line in open('stopwords.txt'))

# dictionary mapping each word to its count
wordcount = {}

# splitting words from punctuation so "book" and "book!" count as the same word
for word in a.lower().split():
    word = word.replace(".", "")
    word = word.replace(",", "")
    word = word.replace(":", "")
    word = word.replace("\"", "")
    word = word.replace("!", "")
    word = word.replace("“", "")
    word = word.replace("‘", "")
    word = word.replace("*", "")

    # counting
    if word not in stopwords:
        if word not in wordcount:
            wordcount[word] = 1
        else:
            wordcount[word] += 1

# print the n most common words
# n_print = int(input("How many most common words to print: "))
n_print = 5
print("\nMost used colonial words are:")
word_counter = collections.Counter(wordcount)
for word, count in word_counter.most_common(n_print):
    print(word, "—", count)

# categories

# words that are inside the category Library Studies
library_studies = set(line.strip() for line in open('library_studies.txt'))

for word, count in word_counter.most_common(n_print):
    if word in library_studies:
        print("\nWe suggest the following categorization for this file:\nLibrary Studies\n")
        break
else:
    # a for loop's else branch runs only when the loop ends without break
    print("\nWe don't have any suggestion of categorization for this file.\n")

# close the file
file.close()
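The chained replace() calls above only cover a fixed list of punctuation marks. Since the punctuation in these texts sits at word edges, the same "book" / "book!" behaviour can be had in one pass with str.strip and the standard string module; a minimal sketch, not part of the script itself:

import string

# strip any leading/trailing punctuation (ASCII plus the curly quotes used above)
def clean(word):
    return word.strip(string.punctuation + "“”‘’")

print(clean("book!"))   # -> book
print(clean("‘book’"))  # -> book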
@@ -1,82 +0,0 @@
import collections
# from termcolor import colored

# this script was adapted from:
# https://towardsdatascience.com/very-simple-python-script-for-extracting-most-common-words-from-a-story-1e3570d0b9d0
# https://git.xpub.nl/rita/categorization_of_files/src/branch/master/categorization.py

# open and read the file
file = open(input("\nwhich platform's Terms of Service do you want to look at: \n"), encoding="utf8")
a = file.read()

# f = open("tiktok.txt", "r")
# print(f.read())

# my stopwords are common words I don't want to count, like "a", "an", "the".
stopwords = set(line.strip() for line in open('stopwords.txt'))

# dictionary mapping each word to its count
wordcount = {}

# splitting words from punctuation so "book" and "book!" count as the same word
for word in a.lower().split():
    word = word.replace(".", "")
    word = word.replace(",", "")
    word = word.replace(":", "")
    word = word.replace("\"", "")
    word = word.replace("!", "")
    word = word.replace("“", "")
    word = word.replace("‘", "")
    word = word.replace("*", "")
    word = word.replace("(", "")
    word = word.replace(")", "")

    # counting
    if word not in stopwords:
        if word not in wordcount:
            wordcount[word] = 1
        else:
            wordcount[word] += 1

# print the n most common words
n_print = 100
print("\nMost used colonial words are:")

word_counter = collections.Counter(wordcount)
for word, count in word_counter.most_common(n_print):
    print(word, "—", count)

# word_counter = collections.Counter(wordcount)
# for word, count in word_counter.most_common(n_print):
#     print(word, "—", count)

# colonial words in bold (not working yet)
# for word in n_print:
#     if word in n_print:
#         wordcount.append(colored(word, 'white', 'on_red'))
#     else:
#         wordcount.append(t)

# print(" ".join(colored(word, 'white', 'on_red')))

# categories

# words that are inside the category Library Studies
library_studies = set(line.strip() for line in open('library_studies.txt'))

for word, count in word_counter.most_common(n_print):
    if word in library_studies:
        print("\nWe suggest the following categorization for this platform:\nLibrary Studies\n")
        break
else:
    print("\nThese are TikTok's colonial words.\n")

# close the file
file.close()
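The commented-out termcolor block above doesn't run as written: it iterates over the integer n_print and appends to a dict. A minimal working sketch of the apparent intent, printing the text with the most common words highlighted on a red background; the example words and text are hypothetical stand-ins, and only termcolor's documented colored(text, color, on_color) call is assumed:

from termcolor import colored

# hypothetical stand-ins for the real text and its most common words
top_words = {"data", "services", "content"}
text = "we collect data to provide our services"

highlighted = []
for word in text.split():
    if word in top_words:
        highlighted.append(colored(word, 'white', 'on_red'))
    else:
        highlighted.append(word)
print(" ".join(highlighted))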
(binary image files not shown; previous size 234 KiB)
@@ -0,0 +1,10 @@
import nltk

file = open('faceapp.txt', 'r')
raw = file.read()
tokens = nltk.word_tokenize(raw)
faceapp = nltk.Text(tokens)

faceapp.concordance('services')
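Note that nltk.word_tokenize and nltk.corpus.stopwords rely on data packages that don't ship with the nltk library itself; the scripts in this commit assume they are already installed. One-time setup, using NLTK's own resource names:

import nltk

# fetch the tokenizer models and stopword lists once
nltk.download('punkt')
nltk.download('stopwords')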
@@ -0,0 +1,39 @@
import sys
import codecs
import nltk
from nltk.corpus import stopwords

# NLTK's default English stopwords
default_stopwords = set(nltk.corpus.stopwords.words('english'))

# read stopwords from a file (one stopword per line, UTF-8)
stopwords_file = './stopwords.txt'
custom_stopwords = set(codecs.open(stopwords_file, 'r', 'utf-8').read().splitlines())

all_stopwords = default_stopwords | custom_stopwords

file = open('faceapp.txt', 'r')
raw = file.read()
tokens = nltk.word_tokenize(raw)
faceapp = nltk.Text(tokens)
faceapp.concordance('services')

# Remove single-character tokens (mostly punctuation)
tokens = [word for word in tokens if len(word) > 1]

# Remove numbers
tokens = [word for word in tokens if not word.isnumeric()]

# Lowercase all words (default_stopwords are lowercase too)
tokens = [word.lower() for word in tokens]

# Remove stopwords
tokens = [word for word in tokens if word not in all_stopwords]

# Calculate frequency distribution
fdist = nltk.FreqDist(tokens)

# Output the top 10 words
for word, frequency in fdist.most_common(10):
    print(u'{};{}'.format(word, frequency))
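The same pipeline can be wrapped as a function so other Terms of Service files can be scored without editing the script. A sketch under the same assumptions as the file above (nltk imported, all_stopwords built); the function name top_words and its parameters are my own illustration:

def top_words(path, all_stopwords, n=10):
    # tokenize, filter and count one Terms of Service file
    raw = open(path, encoding='utf8').read()
    tokens = nltk.word_tokenize(raw)
    tokens = [w.lower() for w in tokens if len(w) > 1 and not w.isnumeric()]
    tokens = [w for w in tokens if w not in all_stopwords]
    return nltk.FreqDist(tokens).most_common(n)

for word, frequency in top_words('faceapp.txt', all_stopwords):
    print('{};{}'.format(word, frequency))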
@@ -0,0 +1,40 @@
import sys
import codecs
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# NLTK's default English stopwords
default_stopwords = set(nltk.corpus.stopwords.words('english'))

# read stopwords from a file (one stopword per line, UTF-8)
stopwords_file = './stopwords.txt'
custom_stopwords = set(codecs.open(stopwords_file, 'r', 'utf-8').read().splitlines())

all_stopwords = default_stopwords | custom_stopwords

file = open('faceapp.txt', 'r')
raw = file.read()
tokens = word_tokenize(raw)
faceapp = nltk.Text(tokens)
faceapp.concordance('services')

# Remove single-character tokens (mostly punctuation)
tokens = [word for word in tokens if len(word) > 1]

# Remove numbers
tokens = [word for word in tokens if not word.isnumeric()]

# Lowercase all words (default_stopwords are lowercase too)
tokens = [word.lower() for word in tokens]

# Remove stopwords
tokens = [word for word in tokens if word not in all_stopwords]

# Calculate frequency distribution
fdist = nltk.FreqDist(tokens)

# Output the top 10 words
for word, frequency in fdist.most_common(10):
    print(u'{};{}'.format(word, frequency))
@@ -1,8 +1,30 @@
 import nltk
 
-f=open('my-file.txt','r')
+file=open('faceapp.txt','r')
-raw=f.read()
+raw=file.read()
 tokens = nltk.word_tokenize(raw)
 faceapp = nltk.Text(tokens)
 
-faceapp.concordance('you')
+# my stopwords are common words I don't want to count, like "a", "an", "the".
+stopwords = set(line.strip() for line in open('stopwords.txt'))
+
+# dictionary mapping each word to its count
+wordcount = {}
+
+# splitting words from punctuation so "book" and "book!" count as the same word
+for word in raw.lower().split():
+    word = word.replace(".", "")
+    word = word.replace(",", "")
+    word = word.replace(":", "")
+    word = word.replace("\"", "")
+    word = word.replace("!", "")
+    word = word.replace("“", "")
+    word = word.replace("‘", "")
+    word = word.replace("*", "")
+    word = word.replace("(", "")
+    word = word.replace(")", "")
+
+faceapp.concordance('a')
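As modified, the loop strips punctuation from each word but never fills wordcount. A sketch of the missing counting step, reusing raw, stopwords and wordcount from the file above and following the counting logic of the earlier scripts in this commit:

import collections

for word in raw.lower().split():
    word = word.strip('.,:"!“‘*()')
    if word and word not in stopwords:
        wordcount[word] = wordcount.get(word, 0) + 1

for word, count in collections.Counter(wordcount).most_common(5):
    print(word, "—", count)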
@@ -0,0 +1,72 @@
EPISTEMIC = "epistemic"  # Expresses degree of coloniality.

# 100.00 = Extreme level of coloniality
#  90.00 =
#  80.00 =
#  70.00 =
#  60.00 =
#  50.00 =
#  40.00 =
#  30.00 =
#  20.00 =
#  10.00 =
#   0.00 = Neutral level of coloniality

# MD  = modal verb: 'would', 'could'...
# RB  = adverb: 'very', 'slightly'...
# VB  = verb
# JJ  = adjective: 'big'...
# NN  = noun
# CC  = coordinating conjunction: 'and', 'or'...
# PRP = personal pronoun: 'I', 'he', 'she'...

# helper assumed by the tables below: pack the given words into a dict
# so membership tests ("word in table") are cheap
def d(*args):
    return dict.fromkeys(args, True)

epistemic_MD = {  # would => could => can => should => shall => will => must
    100.00: d("have", "has", "must", "need"),
     90.00: d("have", "has", "must", "need"),
     80.00: d("can", "ca", "may"),
     70.00: d(),
     60.00: d(),
     50.00: d("shall", "sha"),
     40.00: d("will", "'ll", "wo"),
     30.00: d(),
     20.00: d("can", "ca", "may"),
     10.00: d("could", "dare", "might"),
      0.00: d("would"),
}

# two further tables are still empty placeholders; commented out so they
# don't overwrite the filled-in epistemic_MD table above
# epistemic_MD = {
#     100.00: d(), 90.00: d(), 80.00: d(), 70.00: d(), 60.00: d(), 50.00: d(),
#     40.00: d(), 30.00: d(), 20.00: d(), 10.00: d(), 0.00: d(),
# }
# epistemic_MD = {
#     100.00: d(), 90.00: d(), 80.00: d(), 70.00: d(), 60.00: d(), 50.00: d(),
#     40.00: d(), 30.00: d(), 20.00: d(), 10.00: d(), 0.00: d(),
# }
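A sketch of how such a table could score a sentence: pick out the modal verbs (nltk.pos_tag labels them 'MD', matching the POS comments above, and needs the 'averaged_perceptron_tagger' data package), look each one up in epistemic_MD, and average the scores. The coloniality function and its name are my own illustration, not part of the file:

import nltk
# nltk.download('averaged_perceptron_tagger')  # one-time setup for pos_tag

def coloniality(sentence):
    # average the best matching table score of each modal verb in the sentence
    tokens = nltk.word_tokenize(sentence.lower())
    scores = []
    for word, tag in nltk.pos_tag(tokens):
        if tag == "MD":  # modal verbs: must, can, will, would...
            matches = [score for score, words in epistemic_MD.items() if word in words]
            if matches:
                scores.append(max(matches))
    return sum(scores) / len(scores) if scores else 0.0

print(coloniality("You must accept and we will store your content."))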
@@ -0,0 +1,10 @@
4. User Content

Our Services may allow you and other users to create, post, store and share content, including photos, videos, messages, text, software and other materials (collectively, “User Content”). User Content does not include user-generated filters. Subject to this Agreement and the Privacy Policy, you retain all rights in and to your User Content, as between you and FaceApp. Further, FaceApp does not claim ownership of any User Content that you post on or through the Services. You grant FaceApp a nonexclusive, royalty-free, worldwide, fully paid license to use, reproduce, modify, adapt, create derivative works from, distribute, perform and display your User Content during the term of this Agreement solely to provide you with the Services.

You acknowledge that some of the Services are supported by advertising revenue and may display advertisements and promotions, and you hereby agree that FaceApp may place such advertising and promotions on the Services or on, about, or in conjunction with your User Content. The manner, mode and extent of such advertising and promotions are subject to change without specific notice to you. You acknowledge that we may not always identify paid services, sponsored content, or commercial communications as such.

You represent and warrant that: (i) you own or otherwise have the right to use the User Content modified by you on or through the Services in accordance with the rights and licenses set forth in this Agreement; (ii) you agree to pay for all royalties, fees, and any other monies owed by reason of User Content you stylize on or through the Services; and (iii) you have the legal right and capacity to enter into this Agreement in your jurisdiction.

You may not create, post, store or share any User Content that violates this Agreement or for which you do not have all the rights necessary to grant us the license described above. Although we have no obligation to screen, edit or monitor User Content, we may delete or remove User Content at any time and for any reason.

FaceApp is not a backup service and you agree that you will not rely on the Services for the purposes of User Content backup or storage. FaceApp will not be liable to you for any modification, suspension, or discontinuation of the Services, or the loss of any User Content.