# structure_graphs/mini_corpus.py

'''
Development Notes
JSON is a common format used to represent structured text.

The essence of the program is to introduce noise to disrupt the
mapping relation present in the dictionary.

Rules are also present in machine learning.

Once the mapping rule is disrupted, querying the dictionary again
yields a disrupted message.

The rule is disrupted by a linear arithmetic operation;
are there more non-linear and nuanced rules?

Prototype to translate a system to text into dictionary
as a type of structured text.

str/int conversion is important for debugging the program.

I am still hostile to the concept and etymology of noise, rename
the concept into something else.

'''
import numpy as np
import json

# 1 <= key <= 3, silk; 4 <= key <= 6, earth; 7 <= key <= 9, water

# perform message decryption process via this mini corpus
# "no" field is similar to ascii/morse code/unicode coding protocols
# original message identified by "no" field, no 2 & 3
# first word

# disrupted message identified by "no" field


def parse_json(filename):
    """Load and return the JSON document stored at *filename*.

    The file is decoded as UTF-8, the same encoding ``save_json`` writes,
    and is closed as soon as parsing finishes.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON value, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # A context manager guarantees the handle is released even when
    # json.load raises; the bare open() used before leaked it.
    with open(filename, 'r', encoding='utf-8') as r_file:
        return json.load(r_file)

def intify(data):
    """Convert the "no" field of every record to ``int``, in place.

    Args:
        data: Iterable of dict-like records, each carrying a "no" key whose
            value ``int()`` can convert.

    Returns:
        The same ``data`` object that was passed in, now with integer
        "no" fields.
    """
    for record in data:
        number = int(record["no"])
        record["no"] = number
    return data

def in_msg(data):
    """Print the glyphs of the message as it stands before disruption.

    The message consists of the records whose "no" field equals 2 or 3.
    A header line is printed first, then one glyph per line in the order
    the records occur in *data*.

    Args:
        data: Iterable of dict-like records with a "no" field (compared
            against the integers 2 and 3) and a "glyph" field.

    Returns:
        The list of selected glyphs, in input order. Earlier versions
        returned ``None``; callers that ignore the result are unaffected.
    """
    # One membership test replaces the two duplicated branches; the list of
    # full records that was built alongside but never read has been dropped.
    message_i = [entry["glyph"] for entry in data if entry["no"] in (2, 3)]

    print("message prior to disruption contains: ")
    for glyph in message_i:
        print(glyph)

    return message_i

def disrupt(data):
    """Shift the "no" field of every record by one shared random offset.

    A single offset is drawn with ``np.random.randint(1, 3)`` (the upper
    bound is exclusive, so the offset is 1 or 2) and added to each record's
    "no" field. Every record therefore moves by the same amount.

    Args:
        data: Iterable of dict-like records whose "no" field supports ``+=``
            with an integer.

    Returns:
        The same ``data`` object; the records are modified in place.
    """
    offset = np.random.randint(1, 3)
    for record in data:
        record["no"] += offset
    return data

def save_json(data, filename='corpus_cn.json'):
    """Write *data* to *filename* as indented UTF-8 JSON.

    Non-ASCII characters are written literally rather than as ``\\uXXXX``
    escapes (``ensure_ascii=False``), so glyphs stay readable in the file.

    Args:
        data: Any JSON-serialisable value.
        filename: Destination path. Defaults to ``'corpus_cn.json'``, the
            path this function always wrote to before the parameter existed.

    Raises:
        OSError: If the file cannot be opened for writing.
        TypeError: If *data* contains a value ``json.dump`` cannot serialise.
    """
    with open(filename, 'w', encoding='utf-8') as w_file:
        json.dump(data, w_file, indent=4, ensure_ascii=False)

def out_msg(noise_data):
    """Print the glyphs that the message numbers select after disruption.

    The same "no" values as before disruption (2 and 3) are looked up.
    A header line is printed first, then one glyph per line in the order
    the records occur in *noise_data*.

    Args:
        noise_data: Iterable of dict-like records with a "no" field and a
            "glyph" field. The "no" values are compared as integers, so
            they must not have been turned back into strings.

    Returns:
        The list of selected glyphs, in input order. Earlier versions
        returned ``None``; callers that ignore the result are unaffected.
    """
    # One membership test replaces the two duplicated branches; the list of
    # full records that was built alongside but never read has been dropped.
    message_o = [entry["glyph"] for entry in noise_data if entry["no"] in (2, 3)]

    print("message after disruption contains: ")
    for glyph in message_o:
        print(glyph)

    return message_o

# at this point the interferences are somewhat apparent
# how can i present the interference to be more apparent?
# do i need to work with a really large corpus to make it apparent?
    # todo
# input chinese blocks in here


def main():
    """Run the demo: show the message, disrupt the corpus, show it again.

    Reads ``seed.json``, prints the glyphs selected by message numbers 2
    and 3, shifts every "no" field by a random offset, saves the result,
    then reloads the saved file and prints what the same numbers select.
    """
    parsed_json = parse_json(filename="seed.json")
    intified_data = intify(data=parsed_json)
    in_msg(data=intified_data)
    disrupted_data = disrupt(data=intified_data)
    save_json(data=disrupted_data)
    # save_json writes to 'corpus_cn.json', so that is the file to read
    # back. The previous code read "noised.json", which nothing in this
    # script creates, so the round trip never saw the disrupted data.
    parsed_noise_json = parse_json(filename="corpus_cn.json")
    out_msg(noise_data=parsed_noise_json)


if __name__ == "__main__":
    main()

# test with a large corpus in separate program

# try with corpuses of different language

# try a chinese dictionary and a latin dictionary

# and any other types of dictionary structures, remix!

# main section looks really ugly