Made corpus stroke / no entries into integers; do interesting remix operations with integers.
parent
4779ee09c0
commit
0ef587164a
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,26 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"glyph": "hong",
|
||||||
|
"dept-no": "silk-1",
|
||||||
|
"radical": "silk",
|
||||||
|
"no": 3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"glyph": "jiao",
|
||||||
|
"dept-no": "silk-2",
|
||||||
|
"radical": "silk",
|
||||||
|
"no": 4
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"glyph": "zhu",
|
||||||
|
"dept-no": "silk-3",
|
||||||
|
"radical": "silk",
|
||||||
|
"no": 5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"glyph": "du",
|
||||||
|
"dept-no": "earth-1",
|
||||||
|
"radical": "earth",
|
||||||
|
"no": 6
|
||||||
|
}
|
||||||
|
]
|
@ -1,6 +1,44 @@
|
|||||||
'''
|
'''
|
||||||
|
Development Notes
|
||||||
|
the program temporarily works with word.json to test the viability of
|
||||||
|
working with a large corpus organized as JSON.
|
||||||
|
|
||||||
'''
|
'''
|
||||||
# choose a corpus to work with
|
import json
|
||||||
# look at corpus structure
|
from pprint import pprint
|
||||||
# browse methods to parse large quantity of corpus
|
|
||||||
|
def parse_json(filename):
    """Load and return the JSON document stored at *filename*.

    The file is decoded as UTF-8 (the same encoding ``save_json`` writes
    with) so the result does not depend on the platform's default locale,
    and the handle is closed as soon as parsing finishes.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # A context manager guarantees the handle is released even when
    # json.load raises on malformed input.
    with open(filename, 'r', encoding='utf-8') as r_file:
        return json.load(r_file)
|
||||||
|
|
||||||
|
def save_json(data, filename='corpus/corpus_cn.json'):
    """Write *data* to *filename* as pretty-printed UTF-8 JSON.

    Args:
        data: Any JSON-serializable value.
        filename: Destination path. Defaults to the corpus file this script
            has always written to, so existing callers are unaffected.

    Raises:
        OSError: If the destination cannot be opened for writing.
        TypeError: If *data* contains a value ``json.dump`` cannot serialize.
    """
    with open(filename, 'w', encoding='utf-8') as w_file:
        # ensure_ascii=False keeps non-ASCII characters readable in the file
        # instead of turning them into \uXXXX escapes.
        json.dump(data, w_file, indent=4, ensure_ascii=False)
|
||||||
|
|
||||||
|
def intify(data):
    """Convert every entry's ``"strokes"`` value to ``int``, in place.

    Each element of *data* is expected to be a mapping with a ``"strokes"``
    key. Progress is reported on stdout for every entry, and the same
    *data* object is returned once all entries have been converted.
    """
    for entry in data:
        print("converting to int")
        stroke_count = int(entry["strokes"])
        entry["strokes"] = stroke_count
        # Echo the stored value's type as a quick sanity check.
        print(type(entry["strokes"]))
    return data
|
||||||
|
|
||||||
|
# data sciency methods are not all that interesting, cautiously stay away.
|
||||||
|
# think about what people do with a dictionary
|
||||||
|
# look for a radical; return results with that radical; like Bok.
|
||||||
|
# some general uses of a dictionary
|
||||||
|
|
||||||
|
# def remix_lookup():
|
||||||
|
|
||||||
|
# def radical_lookup():
|
||||||
|
|
||||||
|
# def stroke_lookup():
|
||||||
|
|
||||||
|
# save corpus as separate file to work with
|
||||||
|
|
||||||
|
|
||||||
|
# browse methods to parse large quantity of corpus
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Round-trip the corpus: load it, coerce the stroke counts to int,
    # then write the result back out.
    corpus = parse_json(filename="corpus/corpus_cn.json")
    save_json(data=intify(corpus))
|
||||||
|
Loading…
Reference in New Issue