Converted corpus stroke / no entries into integers; do interesting remix operations with integers.

3 years ago · 0ef587164a
parent 4779ee09c0
commit 0ef587164a
5 changed files with 113075 additions and 9 deletions
--- a/corpus/corpus_cn.json
+++ b/corpus/corpus_cn.json
--- a/corpus_cn.json
+++ b/corpus_cn.json
@ -0,0 +1,26 @@
+[
+    {
+        "glyph": "hong",
+        "dept-no": "silk-1",
+        "radical": "silk",
+        "no": 3
+    },
+    {
+        "glyph": "jiao",
+        "dept-no": "silk-2",
+        "radical": "silk",
+        "no": 4
+    },
+    {
+        "glyph": "zhu",
+        "dept-no": "silk-3",
+        "radical": "silk",
+        "no": 5
+    },
+    {
+        "glyph": "du",
+        "dept-no": "earth-1",
+        "radical": "earth",
+        "no": 6
+    }
+]
--- a/large_corpus.py
+++ b/large_corpus.py
@ -1,6 +1,44 @@
 '''
+Development Notes
+The program temporarily works with word.json to test the viability of
+working with a large corpus organized as JSON.

 '''
-# choose a corpus to work with
-# look at corpus structure 
-# browse methods to parse large quantity of corpus 
+import json
+from pprint import pprint
+
+def parse_json(filename):
+    data = json.load(open(filename,'r'))
+    return data
+
+def save_json(data):
+    with open('corpus/corpus_cn.json','w', encoding='utf-8') as w_file:
+        json.dump(data,w_file, indent=4, ensure_ascii = False)
+
+def intify(data):
+    for i in data:
+        print("converting to int")
+        i["strokes"] = int(i["strokes"]) 
+        print(type(i["strokes"]))
+    return data
+
+# Data-science-style methods are not all that interesting; cautiously stay away.
+    # Think about what people actually do with a dictionary:
+    # look for a radical; return results with that radical; like Bok.
+    # Some general usages of a dictionary.
+
+# def remix_lookup():
+
+# def radical_lookup():
+
+# def stroke_lookup():
+
+# save corpus as separate file to work with 
+
+
+# browse methods to parse large quantity of corpus 
+
+if __name__== "__main__":
+    parsed_json = parse_json(filename="corpus/corpus_cn.json")
+    intified_data = intify(parsed_json)
+    save_json(data = intified_data)
--- a/mini_corpus.py
+++ b/mini_corpus.py
@ -39,15 +39,20 @@ def parse_json(filename):
    data = json.load(open(filename,'r'))
    return data

+def intify(data):
+    for i in data:
+        i["no"] = int(i["no"]) 
+    return data
+
 def in_msg(data):
    message_i = []
    message_full_i = []

    for i in data:
-        if i["no"] == "2":
+        if i["no"] == 2:
            message_i.append(i["glyph"])
            message_full_i.append(i)
-        if i["no"] == "3":
+        if i["no"] == 3:
            message_i.append(i["glyph"])
            message_full_i.append(i)

@ -58,12 +63,11 @@ def in_msg(data):
 def disrupt(data):
    noise = np.random.randint(1,3)
    for i in data:
-        i["no"] = int(i["no"]) 
        i["no"] += noise
    return data

 def save_json(data):
-    with open('noised.json','w', encoding='utf-8') as w_file:
+    with open('corpus_cn.json','w', encoding='utf-8') as w_file:
        json.dump(data,w_file, indent=4, ensure_ascii = False)

 def out_msg(noise_data):
@ -94,8 +98,9 @@ def out_msg(noise_data):

 if __name__ == "__main__":
    parsed_json = parse_json(filename = "seed.json")
-    in_msg(data = parsed_json)
-    disrupted_data = disrupt(data = parsed_json)
+    intified_data = intify(data = parsed_json)
+    in_msg(data = intified_data)
+    disrupted_data = disrupt(data = intified_data)
    save_json(data = disrupted_data)
    parsed_noise_json = parse_json(filename = "noised.json")
    out_msg(noise_data = parsed_noise_json)
--- a/todo.md
+++ b/todo.md
@ -1,2 +1,3 @@
 work on disruptive system text.
 cat >> to append new line!
+convert the values of key:value pairs to integers