|
|
|
'''
Development Notes

The program temporarily works with word.json to test the viability of
working with a large corpus organized as JSON.
'''
|
|
|
|
import json
|
|
|
|
from pprint import pprint
|
|
|
|
|
|
|
|
def parse_json(filename):
    """Load the JSON document stored in *filename* and return it.

    The file is read as UTF-8 so that it round-trips with ``save_json``,
    which writes UTF-8 with non-ASCII characters left unescaped.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Use a context manager so the handle is closed even when decoding fails.
    with open(filename, 'r', encoding='utf-8') as r_file:
        data = json.load(r_file)
    return data
|
|
|
|
|
|
|
|
def save_json(data, filename='corpus/corpus_cn.json'):
    """Write *data* to *filename* as indented UTF-8 JSON.

    Args:
        data: Any JSON-serializable value.
        filename: Destination path. Defaults to ``corpus/corpus_cn.json``,
            the path this function always wrote to before the parameter
            existed, so existing callers are unaffected.

    Raises:
        OSError: If the destination cannot be opened for writing.
        TypeError: If *data* contains a value ``json.dump`` cannot serialize.
    """
    with open(filename, 'w', encoding='utf-8') as w_file:
        # ensure_ascii=False keeps non-ASCII characters readable in the file
        # instead of emitting \uXXXX escapes.
        json.dump(data, w_file, indent=4, ensure_ascii=False)
|
|
|
|
|
|
|
|
def intify(data):
    """Convert the ``"strokes"`` value of every record in *data* to ``int``.

    Each record is updated in place, and progress is printed for every
    record: a fixed message before the conversion and the resulting type
    after it.

    Args:
        data: Iterable of mappings that each have a ``"strokes"`` key.

    Returns:
        The same *data* object that was passed in.
    """
    for entry in data:
        print("converting to int")
        stroke_count = int(entry["strokes"])
        entry["strokes"] = stroke_count
        print(type(stroke_count))
    return data
|
|
|
|
|
|
|
|
# Data-science-style methods are not all that interesting; cautiously stay away.
# Think about what people actually do with a dictionary:
# look for a radical and return the results that contain it, like Bok.
# Cover some general uses of a dictionary.
|
|
|
|
|
|
|
|
# def remix_lookup():
|
|
|
|
|
|
|
|
# def radical_lookup():
|
|
|
|
|
|
|
|
# def stroke_lookup():
|
|
|
|
|
|
|
|
# save corpus as separate file to work with
|
|
|
|
|
|
|
|
|
|
|
|
# browse methods for parsing a large corpus
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: load the corpus, convert stroke counts to int,
    # then write the converted corpus back out via save_json.
    corpus = parse_json(filename="corpus/corpus_cn.json")
    converted = intify(corpus)
    save_json(data=converted)
|