made corpus stroke / no entries into integers, do interesting remix operation with integers.

main
onebigear 3 years ago
parent 4779ee09c0
commit 0ef587164a

File diff suppressed because it is too large Load Diff

@ -0,0 +1,26 @@
[
{
"glyph": "hong",
"dept-no": "silk-1",
"radical": "silk",
"no": 3
},
{
"glyph": "jiao",
"dept-no": "silk-2",
"radical": "silk",
"no": 4
},
{
"glyph": "zhu",
"dept-no": "silk-3",
"radical": "silk",
"no": 5
},
{
"glyph": "du",
"dept-no": "earth-1",
"radical": "earth",
"no": 6
}
]

@ -1,6 +1,44 @@
'''
Development Notes
The program temporarily works with word.json to test the viability of
working with a large corpus organized as JSON.
'''
# choose a corpus to work with
# look at corpus structure
# browse methods to parse large quantity of corpus
import json
from pprint import pprint
def parse_json(filename):
    """Load and return the JSON document stored at *filename*.

    The file is decoded as UTF-8, the same encoding ``save_json`` writes
    with, and is closed as soon as parsing finishes.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces).
    """
    # A context manager guarantees the handle is released even if the
    # document is malformed and json.load raises.
    with open(filename, 'r', encoding='utf-8') as r_file:
        return json.load(r_file)
def save_json(data, filename='corpus/corpus_cn.json'):
    """Write *data* to *filename* as indented, UTF-8 encoded JSON.

    Non-ASCII characters are written verbatim (``ensure_ascii=False``)
    rather than as ``\\uXXXX`` escapes.

    Args:
        data: Any JSON-serialisable value.
        filename: Destination path. Defaults to the corpus file this
            script has always written to, so existing calls are unchanged.
    """
    with open(filename, 'w', encoding='utf-8') as w_file:
        json.dump(data, w_file, indent=4, ensure_ascii=False)
def intify(data, key="strokes", verbose=True):
    """Convert the *key* field of every entry in *data* to ``int``, in place.

    Args:
        data: Iterable of dict-like entries, each holding *key*.
        key: Name of the field to convert. Defaults to ``"strokes"``.
        verbose: When true (the default), print a progress line and the
            resulting type for every entry, as the script always has.
            Pass ``False`` to keep large corpora from flooding stdout.

    Returns:
        The same *data* object, with its entries mutated.

    Raises:
        KeyError: If an entry has no *key* field.
        ValueError: If a value cannot be parsed as an integer.
    """
    for entry in data:
        if verbose:
            print("converting to int")
        entry[key] = int(entry[key])
        if verbose:
            print(type(entry[key]))
    return data
# data sciency methods are not all that interesting, cautiously stay away.
# think about what people do with a dictionary
# look for a radical; return results with that radical; like Bok.
# some general usages of using dictionary
# def remix_lookup():
# def radical_lookup():
# def stroke_lookup():
# save corpus as separate file to work with
# browse methods to parse large quantity of corpus
if __name__ == "__main__":
    # Round-trip the corpus: load it, coerce the stroke counts to
    # integers, then write the result back out.
    corpus = parse_json(filename="corpus/corpus_cn.json")
    save_json(data=intify(corpus))

@ -39,15 +39,20 @@ def parse_json(filename):
data = json.load(open(filename,'r'))
return data
def intify(data, key="no"):
    """Convert the *key* field of every entry in *data* to ``int``, in place.

    Args:
        data: Iterable of dict-like entries, each holding *key*.
        key: Name of the field to convert. Defaults to ``"no"``.

    Returns:
        The same *data* object, with its entries mutated.

    Raises:
        KeyError: If an entry has no *key* field.
        ValueError: If a value cannot be parsed as an integer.
    """
    for entry in data:
        entry[key] = int(entry[key])
    return data
def in_msg(data):
message_i = []
message_full_i = []
for i in data:
if i["no"] == "2":
if i["no"] == 2:
message_i.append(i["glyph"])
message_full_i.append(i)
if i["no"] == "3":
if i["no"] == 3:
message_i.append(i["glyph"])
message_full_i.append(i)
@ -58,12 +63,11 @@ def in_msg(data):
def disrupt(data, noise=None):
    """Shift the ``"no"`` field of every entry by one shared offset, in place.

    Args:
        data: Iterable of dict-like entries, each holding a ``"no"`` value
            that ``int()`` accepts.
        noise: Offset to add to every entry. When omitted, a single value
            is drawn with ``np.random.randint(1, 3)``; the upper bound is
            exclusive, so the offset is 1 or 2. Pass an explicit value
            for a reproducible run.

    Returns:
        The same *data* object, with its entries mutated.
    """
    if noise is None:
        # Drawn once, outside the loop, so every entry moves by the same amount.
        noise = np.random.randint(1, 3)
    for entry in data:
        # Coerce first so string-valued entries can be disrupted as well.
        entry["no"] = int(entry["no"]) + noise
    return data
def save_json(data):
with open('noised.json','w', encoding='utf-8') as w_file:
with open('corpus_cn.json','w', encoding='utf-8') as w_file:
json.dump(data,w_file, indent=4, ensure_ascii = False)
def out_msg(noise_data):
@ -94,8 +98,9 @@ def out_msg(noise_data):
if __name__ == "__main__":
parsed_json = parse_json(filename = "seed.json")
in_msg(data = parsed_json)
disrupted_data = disrupt(data = parsed_json)
intified_data = intify(data = parsed_json)
in_msg(data = intified_data)
disrupted_data = disrupt(data = intified_data)
save_json(data = disrupted_data)
parsed_noise_json = parse_json(filename = "noised.json")
out_msg(noise_data = parsed_noise_json)

@ -1,2 +1,3 @@
work on disruptive system text.
cat >> to append new line!
convert the values of key:value pairs to integers

Loading…
Cancel
Save