diff --git a/sample_dict_json/dictionary.json b/corpus/dictionary.json similarity index 100% rename from sample_dict_json/dictionary.json rename to corpus/dictionary.json diff --git a/sample_dict_json/word.json b/corpus/word.json similarity index 100% rename from sample_dict_json/word.json rename to corpus/word.json diff --git a/sample_dict_json/xinhua.csv b/corpus/xinhua.csv similarity index 100% rename from sample_dict_json/xinhua.csv rename to corpus/xinhua.csv diff --git a/large_corpus.py b/large_corpus.py new file mode 100644 index 0000000..9ea2de4 --- /dev/null +++ b/large_corpus.py @@ -0,0 +1,6 @@ +''' + +''' +# choose a corpus to work with +# look at corpus structure +# browse methods to parse large quantity of corpus \ No newline at end of file diff --git a/dict_test.py b/mini_corpus.py similarity index 96% rename from dict_test.py rename to mini_corpus.py index 1d60774..5e4a1e3 100644 --- a/dict_test.py +++ b/mini_corpus.py @@ -87,6 +87,7 @@ def out_msg(noise_data): # at this point the interferences are somewhat apparent # how can i present the interference to be more apparent? +# do i need to work with a really large corpus to make it apparent? # todo # input chinese blocks in here diff --git a/run.sh b/run.sh index d53017c..d17f4a0 100755 --- a/run.sh +++ b/run.sh @@ -1,5 +1,5 @@ #!/bin/bash # run python script -python3 dict_test.py +python3 mini_corpus.py # cat result cat new.json \ No newline at end of file