made corpus stroke / no entries into integers, do interesting remix operation with integers.

2 years ago · 0ef587164a
parent 4779ee09c0
commit 0ef587164a
5 changed files with 113075 additions and 9 deletions
--- a/corpus/corpus_cn.json
+++ b/corpus/corpus_cn.json
--- a/corpus_cn.json
+++ b/corpus_cn.json
@ -0,0 +1,26 @@
 [
    {
        "glyph": "hong",
        "dept-no": "silk-1",
        "radical": "silk",
        "no": 3
    },
    {
        "glyph": "jiao",
        "dept-no": "silk-2",
        "radical": "silk",
        "no": 4
    },
    {
        "glyph": "zhu",
        "dept-no": "silk-3",
        "radical": "silk",
        "no": 5
    },
    {
        "glyph": "du",
        "dept-no": "earth-1",
        "radical": "earth",
        "no": 6
    }
 ]
--- a/large_corpus.py
+++ b/large_corpus.py
@ -1,6 +1,44 @@
 '''
 Development Notes
 the program temporarily works with word.json to test the viability of
 working with large corpus organized as json. 
 '''
-# choose a corpus to work with
+import json
-# look at corpus structure 
+from pprint import pprint
-# browse methods to parse large quantity of corpus 
+
 def parse_json(filename):
    """Load and return the JSON document stored in *filename*.

    The file is read as UTF-8, the encoding this module's ``save_json``
    writes, so non-ASCII glyphs decode the same way on every platform.
    It is opened in a ``with`` block so the handle is closed as soon as
    parsing finishes instead of being left to the garbage collector.

    Args:
        filename: path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces).

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    with open(filename, 'r', encoding='utf-8') as r_file:
        return json.load(r_file)
 def save_json(data, filename='corpus/corpus_cn.json'):
    """Write *data* to *filename* as indented, human-readable JSON.

    Args:
        data: any JSON-serialisable value.
        filename: destination path. Defaults to ``corpus/corpus_cn.json``,
            the path that used to be hard-coded, so existing calls that
            pass only ``data`` behave exactly as before.

    Raises:
        OSError: if the destination cannot be opened for writing.
        TypeError: if ``data`` contains values ``json.dump`` cannot encode.
    """
    # ensure_ascii=False keeps non-ASCII characters literal in the output,
    # which is why the file is explicitly opened as UTF-8.
    with open(filename, 'w', encoding='utf-8') as w_file:
        json.dump(data, w_file, indent=4, ensure_ascii=False)
 def intify(data):
    """Convert every entry's "strokes" value to an int, in place.

    For each entry a progress line is printed before the conversion and the
    type of the converted value is printed after it. The same ``data``
    object that was passed in is returned.
    """
    for entry in data:
        print("converting to int")
        strokes = int(entry["strokes"])
        entry["strokes"] = strokes
        print(type(strokes))
    return data
 # Data-science-style methods are not all that interesting here; cautiously stay away.
    # Think about what people actually do with a dictionary:
    # look for a radical and return the results with that radical, like Bok.
    # Some general ways a dictionary gets used:
 # def remix_lookup():
 # def radical_lookup():
 # def stroke_lookup():
 # save corpus as separate file to work with 
 # browse methods to parse large quantity of corpus 
 if __name__ == "__main__":
    # Load the corpus, convert its "strokes" values to ints, then save the result.
    corpus = parse_json(filename="corpus/corpus_cn.json")
    save_json(data=intify(corpus))
--- a/mini_corpus.py
+++ b/mini_corpus.py
@ -39,15 +39,20 @@ def parse_json(filename):
    data = json.load(open(filename,'r'))
    return data
 def intify(data):
    """Cast each entry's "no" field to int in place and return ``data``."""
    for entry in data:
        number = int(entry["no"])
        entry["no"] = number
    return data
 def in_msg(data):
    message_i = []
    message_full_i = []
    for i in data:
-        if i["no"] == "2":
+        if i["no"] == 2:
            message_i.append(i["glyph"])
            message_full_i.append(i)
-        if i["no"] == "3":
+        if i["no"] == 3:
            message_i.append(i["glyph"])
            message_full_i.append(i)
@ -58,12 +63,11 @@ def in_msg(data):
 def disrupt(data):
    """Shift every entry's "no" value by one shared random offset, in place.

    A single offset is drawn once with ``np.random.randint(1, 3)`` and added
    to the int-converted "no" of each entry, so all entries move by the same
    amount. The same ``data`` object that was passed in is returned.
    """
    offset = np.random.randint(1, 3)
    for entry in data:
        entry["no"] = int(entry["no"]) + offset
    return data
 def save_json(data):
-    with open('noised.json','w', encoding='utf-8') as w_file:
+    with open('corpus_cn.json','w', encoding='utf-8') as w_file:
        json.dump(data,w_file, indent=4, ensure_ascii = False)
 def out_msg(noise_data):
@ -94,8 +98,9 @@ def out_msg(noise_data):
 if __name__ == "__main__":
    parsed_json = parse_json(filename = "seed.json")
-    in_msg(data = parsed_json)
+    intified_data = intify(data = parsed_json)
-    disrupted_data = disrupt(data = parsed_json)
+    in_msg(data = intified_data)
    disrupted_data = disrupt(data = intified_data)
    save_json(data = disrupted_data)
    parsed_noise_json = parse_json(filename = "noised.json")
    out_msg(noise_data = parsed_noise_json)
--- a/todo.md
+++ b/todo.md
@ -1,2 +1,3 @@
 work on disruptive system text.
 cat >> to append new line!
 convert the value in each key:value pair to an integer