Converted the corpus stroke / no entries into integers; do an interesting remix operation with the integers.

main
onebigear 2 years ago
parent 4779ee09c0
commit 0ef587164a

File diff suppressed because it is too large Load Diff

@ -0,0 +1,26 @@
[
{
"glyph": "hong",
"dept-no": "silk-1",
"radical": "silk",
"no": 3
},
{
"glyph": "jiao",
"dept-no": "silk-2",
"radical": "silk",
"no": 4
},
{
"glyph": "zhu",
"dept-no": "silk-3",
"radical": "silk",
"no": 5
},
{
"glyph": "du",
"dept-no": "earth-1",
"radical": "earth",
"no": 6
}
]

@ -1,6 +1,44 @@
'''
Development Notes
The program temporarily works with word.json to test the viability of
working with a large corpus organized as JSON.
'''
# choose a corpus to work with
import json
# look at corpus structure
from pprint import pprint
def parse_json(filename):
    """Load and return the JSON document stored in *filename*.

    The file is opened inside a ``with`` block so the handle is closed as
    soon as parsing finishes (the previous version left it open until
    garbage collection). It is decoded as UTF-8, the same encoding
    ``save_json`` writes with, so non-ASCII glyphs survive a round trip
    regardless of the platform's default encoding.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(filename, 'r', encoding='utf-8') as r_file:
        return json.load(r_file)
def save_json(data):
    """Write *data* as pretty-printed JSON to ``corpus/corpus_cn.json``.

    The target file is overwritten. Output is UTF-8 with a 4-space indent,
    and non-ASCII characters are written literally rather than escaped.

    Args:
        data: Any JSON-serialisable value.
    """
    target_path = 'corpus/corpus_cn.json'
    with open(target_path, mode='w', encoding='utf-8') as out_file:
        json.dump(
            data,
            out_file,
            ensure_ascii=False,
            indent=4,
        )
def intify(data):
    """Convert every entry's ``"strokes"`` value to ``int``, in place.

    A progress message and the resulting type are printed for each entry.

    Args:
        data: Iterable of dict-like entries, each with a ``"strokes"`` key
            whose value ``int()`` accepts.

    Returns:
        The same *data* object, with its entries mutated.
    """
    for entry in data:
        print("converting to int")
        stroke_count = int(entry["strokes"])
        entry["strokes"] = stroke_count
        # Echo the stored value's type to confirm the conversion happened.
        print(type(stroke_count))
    return data
# data sciency methods are not all that interesting, cautiously stay away.
# think of what do people do with a dictionary
# look for a radical; return results with that radical; like Bok.
# some general usages of using dictionary
# def remix_lookup():
# def radical_lookup():
# def stroke_lookup():
# save corpus as separate file to work with
# browse methods to parse large quantities of corpus data
if __name__ == "__main__":
    # Load the corpus, convert its values to integers, then write the
    # result back out through save_json.
    corpus = parse_json(filename="corpus/corpus_cn.json")
    save_json(data=intify(corpus))

@ -39,15 +39,20 @@ def parse_json(filename):
data = json.load(open(filename,'r')) data = json.load(open(filename,'r'))
return data return data
def intify(data):
for i in data:
i["no"] = int(i["no"])
return data
def in_msg(data): def in_msg(data):
message_i = [] message_i = []
message_full_i = [] message_full_i = []
for i in data: for i in data:
if i["no"] == "2": if i["no"] == 2:
message_i.append(i["glyph"]) message_i.append(i["glyph"])
message_full_i.append(i) message_full_i.append(i)
if i["no"] == "3": if i["no"] == 3:
message_i.append(i["glyph"]) message_i.append(i["glyph"])
message_full_i.append(i) message_full_i.append(i)
@ -58,12 +63,11 @@ def in_msg(data):
def disrupt(data): def disrupt(data):
noise = np.random.randint(1,3) noise = np.random.randint(1,3)
for i in data: for i in data:
i["no"] = int(i["no"])
i["no"] += noise i["no"] += noise
return data return data
def save_json(data): def save_json(data):
with open('noised.json','w', encoding='utf-8') as w_file: with open('corpus_cn.json','w', encoding='utf-8') as w_file:
json.dump(data,w_file, indent=4, ensure_ascii = False) json.dump(data,w_file, indent=4, ensure_ascii = False)
def out_msg(noise_data): def out_msg(noise_data):
@ -94,8 +98,9 @@ def out_msg(noise_data):
if __name__ == "__main__": if __name__ == "__main__":
parsed_json = parse_json(filename = "seed.json") parsed_json = parse_json(filename = "seed.json")
in_msg(data = parsed_json) intified_data = intify(data = parsed_json)
disrupted_data = disrupt(data = parsed_json) in_msg(data = intified_data)
disrupted_data = disrupt(data = intified_data)
save_json(data = disrupted_data) save_json(data = disrupted_data)
parsed_noise_json = parse_json(filename = "noised.json") parsed_noise_json = parse_json(filename = "noised.json")
out_msg(noise_data = parsed_noise_json) out_msg(noise_data = parsed_noise_json)

@ -1,2 +1,3 @@
work on disruptive system text. work on disruptive system text.
cat >> to append new line! cat >> to append new line!
convert key:value value to integers

Loading…
Cancel
Save