new readme assignment
parent
2f55e37ea6
commit
286c39168e
@ -0,0 +1,17 @@
|
||||
import numpy as np
from matplotlib import pyplot as plt

# 10x10 bitmap: 1 marks a filled cell, 0 an empty one.
# Presumably a coarse rendering of the "yong" glyph the variable is named after.
yong_grid = [
    [0, 0, 0, 0, 1, 1, 0, 0, 0, 0],
    [0, 0, 1, 1, 1, 1, 0, 0, 1, 0],
    [0, 0, 0, 0, 0, 1, 0, 1, 0, 0],
    [0, 0, 0, 0, 0, 1, 1, 0, 0, 0],
    [0, 1, 1, 1, 1, 1, 1, 0, 0, 0],
    [0, 0, 0, 1, 0, 1, 1, 0, 0, 0],
    [0, 0, 1, 0, 0, 1, 0, 1, 0, 0],
    [0, 1, 0, 0, 0, 1, 0, 0, 1, 0],
    [0, 0, 0, 1, 0, 1, 0, 0, 0, 0],
    [0, 0, 0, 0, 1, 1, 0, 0, 0, 0],
]

# Text dump of the grid. np.array is used instead of the discouraged
# np.matrix class; a 2-D array prints in the same bracketed row layout.
print(np.array(yong_grid))

# Image rendering of the same grid with the reversed "copper" colormap.
im = plt.imshow(yong_grid, cmap="copper_r")
plt.show()
|
@ -1,38 +0,0 @@
|
||||
bag of words
|
||||
|
||||
irreversibility / stripping away the writer's process...
|
||||
|
||||
book of words
|
||||
|
||||
arbitrariness of the digital
|
||||
link of language to economy of representation
|
||||
separation from the body (pronounceable)
|
||||
|
||||
illusion of a universal language
|
||||
|
||||
"unadulterated data"
|
||||
|
||||
what if the separation is not so easy to make
|
||||
|
||||
Un/Structured
|
||||
|
||||
Brin and Page's RESOURCES.
|
||||
|
||||
Web RESOURCES
|
||||
|
||||
ECONOMIES / Trade offs
|
||||
|
||||
|
||||
BIG FINISH...
|
||||
|
||||
In announcing Google's impending data center in Mons, Belgian prime minister Di Rupo invoked the link between the history of the mining industry in the region and the present and future interest in "data mining" as practiced by Google.
|
||||
|
||||
Whether bales of cotton, barrels of oil, or bags of words, what links these processes is the way in which the notion of "raw material" obscures the labor and power structures employed to secure them. "Raw" is always relative: "purity" depends on processes of "refinement" that typically carry social/ecological impact.
|
||||
|
||||
Stripping language of order is an act of "disembodiment", detaching it from the acts of writing and reading. The shift from (human) reading to machine reading involves a shift of responsibility from the individual human body to the obscured responsibilities and seemingly inevitable forces of the "machine", be it the machine of a market or the machine of an algorithm.
|
||||
|
||||
The (computer scientist's) view of textual content as "unstructured", be it in a webpage or the OCR-scanned pages of a book, reflects a neglect of the processes and labor of writing, editing, design, layout, typesetting, and eventually publishing, collecting and cataloging [11].
|
||||
|
||||
"Unstructured" to the computer scientist then, means non-conformant to particular forms of machine reading. "Structuring" then is a social process by which particular (additional) conventions are agreed upon and employed. The computer scientist often views a text through the eyes of their particular reading algorithm, and in the process (voluntarily) blinds themselves to the work practices which have produced and maintain these "resources".
|
||||
|
||||
Berners-Lee, in chastising his audience of web publishers to not only publish online but to release "unadulterated" data, betrays a lack of imagination in considering how language is itself structured and a blindness to the need for more than additional technical standards to connect to existing publishing practices.
|
@ -1,14 +1,22 @@
|
||||
# Graph
|
||||
|
||||
## Text
|
||||
Dot matrix printer -> ->
|
||||
|
||||
Yong (永) Brainsim
|
||||
|
||||
Tribute to BRAINSIM, a neural network on the Commodore 64
|
||||
|
||||
My discussion (with @nickm) on BRAINSIM can be found in the archive of the Critical Code Studies Working Group 2022: https://wg.criticalcodestudies.com/index.php?p=/discussion/117/brainsim-neural-network-on-a-commodore-64-2022-code-critique#latest
|
||||
|
||||
|
||||
|
||||
|
||||
## Prototype
|
||||
simple as possible; add in noise to reflect word is graph.
|
||||
![](8b.gif)
|
||||
![](fig_12_3.png)
|
||||
![](note_jun_18_00.jpg)
|
||||
![](note_jun_18_01.jpg)
|
||||
![](note_jun_18_02.jpg)
|
||||
|
||||
# Mathematical analysis with text
|
||||
## Bag of Words
|
||||
![](fig_12_4.png)
|
||||
|
||||
## Term Frequency Inverse Document Frequency (TF-IDF)
|
||||
Heuristically, it reflects the role of a given word in relation to a given corpus. Used in information retrieval.
|
||||
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
@ -1,26 +0,0 @@
|
||||
[
|
||||
{
|
||||
"glyph": "hong",
|
||||
"dept-no": "silk-1",
|
||||
"radical": "silk",
|
||||
"no": 3
|
||||
},
|
||||
{
|
||||
"glyph": "jiao",
|
||||
"dept-no": "silk-2",
|
||||
"radical": "silk",
|
||||
"no": 4
|
||||
},
|
||||
{
|
||||
"glyph": "zhu",
|
||||
"dept-no": "silk-3",
|
||||
"radical": "silk",
|
||||
"no": 5
|
||||
},
|
||||
{
|
||||
"glyph": "du",
|
||||
"dept-no": "earth-1",
|
||||
"radical": "earth",
|
||||
"no": 6
|
||||
}
|
||||
]
|
Binary file not shown.
Before Width: | Height: | Size: 7.9 KiB |
@ -1,44 +0,0 @@
|
||||
'''
|
||||
Development Notes
|
||||
the program temporarily works with word.json to test the viability of
|
||||
working with a large corpus organized as JSON.
|
||||
|
||||
'''
|
||||
import json
|
||||
from pprint import pprint
|
||||
|
||||
def parse_json(filename):
    """Load and return the JSON document stored in ``filename``.

    The file is decoded as UTF-8, matching how ``save_json`` in this script
    writes it (UTF-8 with ``ensure_ascii=False``), and the handle is closed
    as soon as parsing finishes.
    """
    with open(filename, 'r', encoding='utf-8') as r_file:
        return json.load(r_file)
|
||||
|
||||
def save_json(data, filename='corpus/corpus_cn.json'):
    """Write ``data`` as pretty-printed JSON.

    Args:
        data: any JSON-serialisable object.
        filename: destination path; defaults to the corpus file this script
            has always written to, so existing calls behave as before.

    The output is UTF-8 with non-ASCII characters kept verbatim
    (``ensure_ascii=False``) and a 4-space indent.
    """
    with open(filename, 'w', encoding='utf-8') as w_file:
        json.dump(data, w_file, indent=4, ensure_ascii=False)
|
||||
|
||||
def intify(data):
    """Convert every entry's ``"strokes"`` value to ``int`` in place.

    Prints a progress line and the resulting type for each entry, then
    returns the same ``data`` object that was passed in.
    """
    for entry in data:
        print("converting to int")
        strokes = int(entry["strokes"])
        entry["strokes"] = strokes
        print(type(strokes))
    return data
|
||||
|
||||
# data sciency methods are not all that interesting, cautiously stay away.
|
||||
# think of what do people do with a dictionary
|
||||
# look for a radical; return results with that radical; like Bok.
|
||||
# some general usages of using dictionary
|
||||
|
||||
# def remix_lookup():
|
||||
|
||||
# def radical_lookup():
|
||||
|
||||
# def stroke_lookup():
|
||||
|
||||
# save corpus as separate file to work with
|
||||
|
||||
|
||||
# browse methods to parse large quantity of corpus
|
||||
|
||||
if __name__ == "__main__":
    # Round trip: load the corpus, coerce stroke counts to int, write it back.
    corpus = parse_json(filename="corpus/corpus_cn.json")
    save_json(data=intify(corpus))
|
@ -1,116 +0,0 @@
|
||||
'''
|
||||
Development Notes
|
||||
JSON is a common format used to represent structured text.
|
||||
|
||||
The essence of the program is to introduce noise to disrupt the
|
||||
mapping relation present in the dictionary.
|
||||
|
||||
Rules are also present in machine learning.
|
||||
|
||||
The mapping rule is disrupted, when I query the dictionary again,
|
||||
the message is disrupted.
|
||||
|
||||
The rule is disrupted by a linear arithmetic operation;
|
||||
are there more non-linear and nuanced rules?
|
||||
|
||||
Prototype to translate a system to text into dictionary
|
||||
as a type of structured text.
|
||||
|
||||
str int conversion is important to debug the program
|
||||
|
||||
I am still hostile to the concept and etymology of noise, rename
|
||||
the concept into something else.
|
||||
|
||||
'''
|
||||
import numpy as np
|
||||
import json
|
||||
|
||||
# 1 <= key <= 3, silk; 4 <= key <= 6, earth; 7 <= key <= 9, water
|
||||
|
||||
# perform message decryption process via this mini corpus
|
||||
# "no" field is similar to ascii/morse code/unicode coding protocols
|
||||
# original message identified by "no" field, no 2 & 3
|
||||
# first wrd
|
||||
|
||||
# disrupted message identified by "no" field
|
||||
|
||||
|
||||
def parse_json(filename):
    """Load and return the JSON document stored in ``filename``.

    The file is decoded as UTF-8, the encoding ``save_json`` in this script
    writes, and the handle is closed as soon as parsing finishes.
    """
    with open(filename, 'r', encoding='utf-8') as r_file:
        return json.load(r_file)
|
||||
|
||||
def intify(data):
    """Convert every entry's ``"no"`` value to ``int`` in place.

    Returns the same ``data`` object that was passed in.
    """
    for entry in data:
        entry.update(no=int(entry["no"]))
    return data
|
||||
|
||||
def in_msg(data):
    """Print the glyphs that make up the message before disruption.

    The message consists of the entries whose ``"no"`` field equals 2 or 3,
    printed in the order they appear in ``data``. Nothing is returned.
    """
    message_i = [entry["glyph"] for entry in data if entry["no"] in (2, 3)]

    print("message prior to disruption contains: ")
    for glyph in message_i:
        print(glyph)
|
||||
|
||||
def disrupt(data):
    """Shift every entry's ``"no"`` field by one shared random offset.

    The offset is drawn once per call with ``np.random.randint(1, 3)`` and
    added to each entry in place; the same ``data`` object is returned.
    """
    offset = np.random.randint(1, 3)
    for entry in data:
        entry["no"] = entry["no"] + offset
    return data
|
||||
|
||||
def save_json(data, filename='corpus_cn.json'):
    """Write ``data`` as pretty-printed JSON.

    Args:
        data: any JSON-serialisable object.
        filename: destination path; defaults to the file this script has
            always written to, so existing calls behave as before.

    The output is UTF-8 with non-ASCII characters kept verbatim
    (``ensure_ascii=False``) and a 4-space indent.
    """
    with open(filename, 'w', encoding='utf-8') as w_file:
        json.dump(data, w_file, indent=4, ensure_ascii=False)
|
||||
|
||||
def out_msg(noise_data):
    """Print the glyphs that make up the message after disruption.

    Selects the entries whose ``"no"`` field equals 2 or 3, in the order
    they appear in ``noise_data``, and prints their glyphs. Nothing is
    returned.
    """
    # Integer comparison: the "no" fields were dumped to JSON as integers.
    message_o = [entry["glyph"] for entry in noise_data if entry["no"] in (2, 3)]

    print("message after disruption contains: ")
    for glyph in message_o:
        print(glyph)
|
||||
|
||||
# at this point the interferences are somewhat apparent
|
||||
# how can i present the interference to be more apparent?
|
||||
# do i need to work with a really large corpus to make it apparent?
|
||||
# todo
|
||||
# input chinese blocks in here
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Read the seed dictionary and show the message it encodes.
    seed = intify(data=parse_json(filename="seed.json"))
    in_msg(data=seed)

    # Shift the "no" fields and persist the disrupted dictionary.
    save_json(data=disrupt(data=seed))

    # Query the disrupted dictionary again.
    # NOTE(review): save_json() writes 'corpus_cn.json' while this reads
    # 'noised.json' -- confirm which file is meant to carry the result.
    out_msg(noise_data=parse_json(filename="noised.json"))
|
||||
|
||||
# test with a large corpus in separate program
|
||||
|
||||
# try with corpora in different languages
|
||||
|
||||
# try a chinese dictionary and a latin dictionary
|
||||
|
||||
# and any other types of dictionary structures, remix!
|
||||
|
||||
# main section looks really ugly
|
@ -1,26 +0,0 @@
|
||||
[
|
||||
{
|
||||
"glyph": "hong",
|
||||
"dept-no": "silk-1",
|
||||
"radical": "silk",
|
||||
"no": 3
|
||||
},
|
||||
{
|
||||
"glyph": "jiao",
|
||||
"dept-no": "silk-2",
|
||||
"radical": "silk",
|
||||
"no": 4
|
||||
},
|
||||
{
|
||||
"glyph": "zhu",
|
||||
"dept-no": "silk-3",
|
||||
"radical": "silk",
|
||||
"no": 5
|
||||
},
|
||||
{
|
||||
"glyph": "du",
|
||||
"dept-no": "earth-1",
|
||||
"radical": "earth",
|
||||
"no": 6
|
||||
}
|
||||
]
|
Binary file not shown.
Before Width: | Height: | Size: 82 KiB |
Binary file not shown.
Before Width: | Height: | Size: 75 KiB |
Binary file not shown.
Before Width: | Height: | Size: 45 KiB |
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1,5 +0,0 @@
|
||||
#!/bin/bash
# Run the Python script.
python3 mini_corpus.py
# Print the contents of new.json.
cat new.json
|
@ -1,30 +0,0 @@
|
||||
[
|
||||
{
|
||||
"glyph": "hong",
|
||||
"dept-no": "silk-1",
|
||||
"radical": "silk",
|
||||
"no": "1"
|
||||
},
|
||||
|
||||
{
|
||||
"glyph": "jiao",
|
||||
"dept-no": "silk-2",
|
||||
"radical": "silk",
|
||||
"no": "2"
|
||||
},
|
||||
|
||||
{
|
||||
"glyph": "zhu",
|
||||
"dept-no": "silk-3",
|
||||
"radical": "silk",
|
||||
"no": "3"
|
||||
},
|
||||
|
||||
{
|
||||
"glyph": "du",
|
||||
"dept-no": "earth-1",
|
||||
"radical": "earth",
|
||||
"no": "4"
|
||||
}
|
||||
]
|
||||
|
@ -1,191 +0,0 @@
|
||||
import os, json, re
|
||||
from math import log, exp
|
||||
from flask import Markup
|
||||
|
||||
from nltk import sent_tokenize
|
||||
from nltk.tokenize import RegexpTokenizer
|
||||
tokenizer = RegexpTokenizer(r'\w+') # initialize tokenizer
|
||||
|
||||
import pprint
|
||||
pp = pprint.PrettyPrinter(indent=4)
|
||||
|
||||
def tfidf(query, words, corpus):
    """Score ``query`` for one document of ``corpus`` with TF-IDF.

    Args:
        query: the term to score.
        words: tokens of the document being scored.
        corpus: list of token lists, one per document.

    Returns:
        A tuple ``(tf_count, idf_count, tfidf_value)`` where ``tf_count`` is
        how often ``query`` occurs in ``words``, ``idf_count`` is the number
        of documents in ``corpus`` that contain ``query``, and
        ``tfidf_value`` is
        ``(tf_count / len(words)) * log(len(corpus) / idf_count)``.
        ``tfidf_value`` is ``0.0`` when ``words`` is empty or no document
        contains ``query``, instead of dividing by zero.
    """
    # Term frequency: raw occurrences of the query in this document.
    tf_count = sum(1 for word in words if word == query)

    # Document frequency: how many documents mention the query at all.
    idf_count = sum(1 for document in corpus if query in document)

    if not words or idf_count == 0:
        return tf_count, idf_count, 0.0

    tf = tf_count / len(words)
    idf = log(len(corpus) / idf_count)
    tfidf_value = tf * idf

    return tf_count, idf_count, tfidf_value
|
||||
|
||||
def load_text_files():
    """Read every file in the ``txt`` directory and tokenize it.

    Files are processed in sorted filename order.

    Returns:
        A tuple ``(files, corpus, sentences)``:
        ``files`` is the list of filenames, ``corpus`` is a parallel list
        holding each file's words in reading order (punctuation dropped by
        the module-level ``tokenizer``), and ``sentences`` maps each
        filename to the list produced by ``sent_tokenize``.
    """
    files = []
    corpus = []
    sentences = {}
    text_dir = 'txt'

    for name in sorted(os.listdir(text_dir)):
        # Close each file as soon as it is read instead of leaking the handle.
        with open(os.path.join(text_dir, name), "r") as handle:
            text = handle.read()
        corpus.append(tokenizer.tokenize(text))  # all words of one manifesto
        sentences[name] = sent_tokenize(text)
        files.append(name)

    print('*txt files loaded*')
    return files, corpus, sentences
|
||||
|
||||
def create_index():
    """Build the TF-IDF index for all text files and write ``index.json``.

    The index has one entry per file, shaped like::

        index = {
            'Fem manifesto': {
                'sentences': ['First sentence.', 'Second sentence.'],
                'words': {'aap': 39.2, 'beer': 20.456},   # TF-IDF values
                'tf': {'aap': 4, 'beer': 6},              # raw counts
            }
        }

    ``words`` and ``tf`` are always present, also for a file without any
    words, so readers of the index can rely on both keys.
    """
    files, corpus, sentences = load_text_files()
    index = {}

    for manifesto, words in zip(files, corpus):
        entry = {'sentences': sentences[manifesto], 'words': {}, 'tf': {}}
        # Score each distinct word once: repeated tokens give the same
        # result and would only overwrite the same dictionary slot.
        for word in set(words):
            tf_count, _, tfidf_value = tfidf(word, words, corpus)
            entry['words'][word] = tfidf_value
            entry['tf'][word] = tf_count
        index[manifesto] = entry

    # The with-block closes the file; no explicit close() is needed.
    with open('index.json', 'w+') as out:
        out.write(json.dumps(index, indent=4, sort_keys=True))
    print('*index created*')
|
||||
|
||||
def load_index():
    """Return the parsed contents of ``index.json`` from the working directory."""
    # Context manager so the file handle is closed after parsing.
    with open('index.json') as handle:
        return json.load(handle)
|
||||
|
||||
def request_results(query):
    """Look up ``query`` in ``index.json`` and build ranked, highlighted results.

    Args:
        query: the search word; surrounding whitespace is stripped.

    Returns:
        A tuple ``(results, files)``. ``files`` lists every manifesto name
        in the index. ``results`` maps a rank (0 = highest TF-IDF) to::

            {
                'tfidf': 0.00041,
                'name': 'Fem_manifesto',
                'tf': 4,                 # occurrences of the query
                'total': 1200,           # distinct indexed words
                'sentences': [Markup, ...],
            }

        ``sentences`` holds only the sentences containing the query as a
        token, with the query wrapped in a ``<strong>`` tag whose font size
        grows with the TF-IDF value.
    """
    query = query.strip()
    with open('index.json') as handle:
        index = json.load(handle)
    files = list(index)

    # Collect one row per manifesto that uses the query word. The indexed
    # words are dictionary keys, so a direct lookup replaces a full scan.
    result_matches = []
    for manifesto, entry in index.items():
        words = entry['words']
        if query in words:
            result_matches.append([
                words[query],
                manifesto,
                entry['tf'][query],
                len(words),
                entry['sentences'],
            ])

    # Highest TF-IDF first; ties fall back to the manifesto name.
    result_matches.sort(reverse=True)

    results = {}
    for x, (value, name, tf, total, sentences) in enumerate(result_matches):
        results[x] = {
            'tfidf': value,
            'name': name,
            'tf': tf,
            'total': total,
            'sentences': sentences,
        }

    pp.pprint(results)

    # Keep only the sentences that contain the query word and highlight it.
    for x, manifesto in results.items():
        value = manifesto['tfidf'] * 50000
        result_sentences = []
        for s in manifesto['sentences']:
            # Each matching sentence is added once, however often the word
            # occurs in it.
            if query in tokenizer.tokenize(s):
                sentence = re.sub(r'[ .,;/\\*]'+query+r'[ ,.;/\\*]', '<strong style="font-size:{}%;"> {} </strong>'.format(100 + value, query), s)
                # NOTE(review): the sentence text is marked as safe markup
                # without escaping -- confirm the txt corpus is trusted.
                result_sentences.append(Markup(sentence))
        results[x]['sentences'] = result_sentences

    print('*results returned*')
    return results, files
|
||||
|
||||
def request_ordered():
    """Return each manifesto's words ordered by descending TF-IDF value.

    Returns:
        A tuple ``(results, files)``: ``files`` lists every manifesto name
        in ``index.json``; ``results`` maps each name to a list of
        ``[value, word]`` pairs sorted from highest to lowest.
    """
    # Context manager so the file handle is closed after parsing.
    with open('index.json') as handle:
        index = json.load(handle)
    files = list(index)

    results = {}
    for manifesto, entry in index.items():
        pairs = [[value, word] for word, value in entry['words'].items()]
        results[manifesto] = sorted(pairs, reverse=True)
    return results, files
|
||||
|
||||
def request_ordered_all():
    """Return every indexed word of every manifesto in one ascending list.

    Returns:
        A tuple ``(results, files)``: ``files`` lists every manifesto name
        in ``index.json``; ``results`` is a list of ``[value, word, i]``
        rows sorted in ascending order.

    NOTE(review): ``i`` is taken to be the position of the manifesto in
    ``files`` (the counter advancing once per manifesto) -- confirm against
    the consumer of this list.
    """
    # Context manager so the file handle is closed after parsing.
    with open('index.json') as handle:
        index = json.load(handle)
    files = list(index)

    results = []
    for i, entry in enumerate(index.values()):
        for word, value in entry['words'].items():
            results.append([value, word, i])
    results.sort()
    return results, files
|
@ -1,6 +0,0 @@
|
||||
work on disruptive system text.
|
||||
cat >> to append new line!
|
||||
convert key:value value to integers
|
||||
todo 1: []tf-idf []markov []concord []conway
|
||||
todo 2: txt2img
|
||||
todo 3: disruption sort
|
@ -1,7 +0,0 @@
|
||||
Title: Yong Brainsim Randomization
|
||||
Date: 2022-04-04
|
||||
Category: computer_archaeology
|
||||
Tags: computer_archaeology_cn, machine_learning
|
||||
Slug: yong-brainsim-randomization
|
||||
Authors: onebigear
|
||||
Summary: randomized matrix representation of the “yong” 永 character.
|
Loading…
Reference in New Issue