You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

647 lines
30 KiB
Plaintext

3 years ago
{
"cells": [
{
"cell_type": "code",
"execution_count": 20,
"id": "03589aa7-7fc4-4fe1-92ac-9f523e2ba90c",
"metadata": {},
"outputs": [],
"source": [
"# print ascii encoding of a character"
]
},
{
"cell_type": "code",
"execution_count": 65,
"id": "e068c988-0a8a-45d8-ac64-ea07cf37b275",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\"'\\\\u4f60'\""
]
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ascii(\"你\")"
]
},
{
"cell_type": "code",
"execution_count": 66,
"id": "0760b0c9-17f7-4fca-87a1-2c3423aaf61f",
"metadata": {},
"outputs": [],
"source": [
"# what is this ascii standard?"
]
},
{
"cell_type": "code",
"execution_count": 67,
"id": "44976d3e-c6a0-4d2e-8e3a-9d21ac254803",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\"'a'\""
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ascii(\"a\")"
]
},
{
"cell_type": "code",
"execution_count": 68,
"id": "8ffd857f-e182-49d0-9722-f8d2204f9054",
"metadata": {},
"outputs": [],
"source": [
"# try ord "
]
},
{
"cell_type": "code",
"execution_count": 69,
"id": "97c7c7d1-b79f-42ac-8d00-81c12b428281",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"97"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ord(\"a\")"
]
},
{
"cell_type": "code",
"execution_count": 70,
"id": "9477f26f-3e9c-4a4c-be1b-1c135cd3688e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"65"
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ord(\"A\")"
]
},
{
"cell_type": "code",
"execution_count": 71,
"id": "f0bb5070-d333-46c2-bdea-da6404989ba3",
"metadata": {},
"outputs": [],
"source": [
"#Print alphabet lower and upper case\n",
"alphabet = [\"a\",\"b\",\"c\",\"d\",\"e\",\"f\",\"g\",\"h\",\"i\",\"j\",\"k\",\"l\",\"m\",\"n\",\"o\",\"p\",\"q\",\"r\",\"s\",\"t\",\"u\",\"v\",\"w\",\"x\",\"y\",\"z\"]\n",
"ALPHABET = []\n",
"for letter in alphabet: ALPHABET.append(letter.upper())"
]
},
{
"cell_type": "code",
"execution_count": 72,
"id": "b53660cc-cee1-4d69-8498-c94edd84e5aa",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Printing corresponding ASCII numerical representation :\n",
"a :97 b :98 c :99 d :100 e :101 f :102 g :103 h :104 i :105 j :106 k :107 l :108 m :109 n :110 o :111 p :112 q :113 r :114 s :115 t :116 u :117 v :118 w :119 x :120 y :121 z :122 "
]
}
],
"source": [
"print(\"Printing corresponding ASCII numerical representation :\")\n",
"for letter in alphabet: \n",
" print(\"{} :\".format (letter) + str (ord(letter)), end = \" \")"
]
},
{
"cell_type": "code",
"execution_count": 73,
"id": "4f7ec4ff-b6e6-4cda-b8a3-6e2a3521d9c8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Printing corresponding ASCII numerical representation :\n",
"A :65 B :66 C :67 D :68 E :69 F :70 G :71 H :72 I :73 J :74 K :75 L :76 M :77 N :78 O :79 P :80 Q :81 R :82 S :83 T :84 U :85 V :86 W :87 X :88 Y :89 Z :90 "
]
}
],
"source": [
"print(\"Printing corresponding ASCII numerical representation :\")\n",
"for LETTER in ALPHABET: \n",
" print(\"{} :\".format (LETTER) + str (ord(LETTER)), end = \" \")"
]
},
{
"cell_type": "code",
"execution_count": 74,
"id": "87963ac8-050c-44a5-9f5a-af94e5ba40ce",
"metadata": {},
"outputs": [],
"source": [
"# the lower case letter representing the same letter is +32 of the capital letter, and thus the following hack \n",
"# without using the upper() method in Python"
]
},
{
"cell_type": "code",
"execution_count": 75,
"id": "6318993c-1f92-4aa5-befd-2e01963bda15",
"metadata": {},
"outputs": [],
"source": [
"# takes input of a lower case char\n",
"def upperHack(c):\n",
" # get number of the lower case letter\n",
" l_num = ord(c)\n",
" u_num = l_num - 32\n",
" # return the upper case letter from u_num with chr()method\n",
" return chr(u_num)"
]
},
{
"cell_type": "code",
"execution_count": 76,
"id": "b47bae87-4e21-4a9c-a90a-6f999df2ac95",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'C'"
]
},
"execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"upperHack(\"c\")"
]
},
{
"cell_type": "code",
"execution_count": 77,
"id": "f0c628f3-e92c-446d-81c4-9cc7149b3a1c",
"metadata": {},
"outputs": [],
"source": [
"# a vice versa method\n",
"def lowerHack(C):\n",
" # get number of the lower case letter\n",
" u_num = ord(C)\n",
" l_num = u_num + 32\n",
" # return the upper case letter from u_num with chr()method\n",
" return chr(l_num)"
]
},
{
"cell_type": "code",
"execution_count": 78,
"id": "46a9f58f-5428-4901-85f4-73542d8110fd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'c'"
]
},
"execution_count": 78,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lowerHack(\"C\")"
]
},
{
"cell_type": "code",
"execution_count": 79,
"id": "a467b7a2-1ea5-4148-bfa4-4424ec110465",
"metadata": {},
"outputs": [],
"source": [
"# 好,现在我们把西文的方法应用到中文里去\n",
"# 拉丁语系下的西文中的词由字母构成;中文里的字由笔划构成。\n",
"# 我们先从系统层面思考笔划是构造单个汉字的材料在python中可以实现对笔划的探索性编程exploratory programming\n",
"# 或者,我们可以在“语料库”的系统层面思考。\n",
"#《千字文》中包含了一千个不重复的汉字,我们可以理解为这是古人的“识字课本”。\n",
"# 汉字和千字文之间个体与系统的关系,好比“永字八法”中,笔划和“永”字之间的关系。\n",
"# 以王羲之的永字为范本,学童反复摹习,掌握“永”字中的“侧、勒、弩、趯、策、掠、啄、磔”,以期掌握写好每一个汉字。\n",
"# 同样,以千字文为范本,学童反复背诵研习,为了识字,并以“字”为单位行文。"
]
},
{
"cell_type": "code",
"execution_count": 80,
"id": "92840cd9-b859-4c27-9016-80608168e448",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"天地玄黄宇宙洪荒。日月盈昃辰宿列张。4\n",
"寒来暑往秋收冬藏。闰余成岁律吕调阳。8\n",
"云腾致雨露结为霜。金生丽水玉出昆冈。12\n",
"剑号巨阙珠称夜光。果珍李柰菜重芥姜。16\n",
"海咸河淡鳞潜羽翔。龙师火帝鸟官人皇。20\n",
"始制文字乃服衣裳。推位让国有虞陶唐。24\n",
"吊民伐罪周发殷汤。坐朝问道垂拱平章。28\n",
"爱育黎首臣伏戎羌。遐迩壹体率宾归王。32\n",
"鸣凤在树白驹食场。化被草木赖及万方。36\n",
"盖此身发四大五常。恭惟鞠养岂敢毁伤。40\n",
"女慕贞洁男效才良。知过必改得能莫忘。44\n",
"罔谈彼短靡恃己长。信使可覆器欲难量。48\n",
"墨悲丝染诗赞羔羊。景行维贤克念作圣。52\n",
"德建名立形端表正。空谷传声虚堂习听。56\n",
"祸因恶积福缘善庆。尺璧非宝寸阴是竞。60\n",
"资父事君曰严与敬。孝当竭力忠则尽命。64\n",
"临深履薄夙兴温凊。似兰斯馨如松之盛。68\n",
"川流不息渊澄取映。容止若思言辞安定。72\n",
"笃初诚美慎终宜令。荣业所基藉甚无竟。76\n",
"学优登仕摄职从政。存以甘棠去而益咏。80\n",
"乐殊贵贱礼别尊卑。上和下睦夫唱妇随。84\n",
"外受傅训入奉母仪。诸姑伯叔犹子比儿。88\n",
"孔怀兄弟同气连枝。交友投分切磨箴规。92\n",
"仁慈隐恻造次弗离。节义廉退颠沛匪亏。96\n",
"性静情逸心动神疲。守真志满逐物意移。100\n",
"坚持雅操好爵自縻。都邑华夏东西二京。104\n",
"背邙面洛浮渭据泾。宫殿盘郁楼观飞惊。108\n",
"图写禽兽画彩仙灵。丙舍傍启甲帐对楹。112\n",
"肆筵设席鼓瑟吹笙。升阶纳陛弁转疑星。116\n",
"右通广内左达承明。既集坟典亦聚群英。120\n",
"杜稿钟隶漆书壁经。府罗将相路侠槐卿。124\n",
"户封八县家给千兵。高冠陪辇驱毂振缨。128\n",
"世禄侈富车驾肥轻。策功茂实勒碑刻铭。132\n",
"磻溪伊尹佐时阿衡。奄宅曲阜微旦孰营。136\n",
"桓公匡合济弱扶倾。绮回汉惠说感武丁。140\n",
"俊乂密勿多士实宁。晋楚更霸赵魏困横。144\n",
"假途灭虢践土会盟。何遵约法韩弊烦刑。148\n",
"起翦颇牧用军最精。宣威沙漠驰誉丹青。152\n",
"九州禹迹百郡秦并。岳宗恒岱禅主云亭。156\n",
"雁门紫塞鸡田赤城。昆池碣石巨野洞庭。160\n",
"旷远绵邈岩岫杳冥。治本于农务兹稼穑。164\n",
"俶载南亩我艺黍稷。税熟贡新劝赏黜陟。168\n",
"孟轲敦素史鱼秉直。庶几中庸劳谦谨敕。172\n",
"聆音察理鉴貌辨色。贻厥嘉猷勉其祗植。176\n",
"省躬讥诫宠增抗极。殆辱近耻林皋幸即。180\n",
"两疏见机解组谁逼。索居闲处沉默寂寥。184\n",
"求古寻论散虑逍遥。欣奏累遣戚谢欢招。188\n",
"渠荷的历园莽抽条。枇杷晚翠梧桐早凋。192\n",
"陈根委翳落叶飘摇。游鹍独运凌摩绛霄。196\n",
"耽读玩市寓目囊箱。易輶攸畏属耳垣墙。200\n",
"具膳餐饭适口充肠。饱饫烹宰饥厌糟糠。204\n",
"亲戚故旧老少异粮。妾御绩纺侍巾帷房。208\n",
"纨扇圆洁银烛炜煌。昼眠夕寐蓝笋象床。212\n",
"弦歌酒宴接杯举觞。矫手顿足悦豫且康。216\n",
"嫡后嗣续祭祀烝尝。稽颡再拜悚惧恐惶。220\n",
"笺牒简要顾答审详。骸垢想浴执热愿凉。224\n",
"驴骡犊特骇跃超骧。诛斩贼盗捕获叛亡。228\n",
"布射辽丸嵇琴阮啸。恬笔伦纸钧巧任钓。232\n",
"释纷利俗并皆佳妙。毛施淑姿工颦妍笑。236\n",
"年矢每催曦晖朗曜。璇玑悬斡晦魄环照。240\n",
"指薪修祜永绥吉劭。矩步引领俯仰廊庙。244\n",
"束带矜庄徘徊瞻眺。孤陋寡闻愚蒙等诮。248\n",
"谓语助者焉哉乎也。250\n"
]
}
],
"source": [
"# read in \"A Thousand Character Essay\"\n",
"thousand_w = open(\"files/thousand_char_essay.txt\",\"r\")\n",
"corpus = thousand_w.read(None)\n",
"print(corpus)"
]
},
{
"cell_type": "code",
"execution_count": 81,
"id": "23892e53-9d2e-4d93-b98f-e4fb08ad910c",
"metadata": {},
"outputs": [],
"source": [
"# remove counter numbers from corpus \n",
"clean_corpus=[]\n",
"for word in corpus:\n",
" if word.isdigit() is not True:\n",
" clean_corpus.append(word)"
]
},
{
"cell_type": "code",
"execution_count": 82,
"id": "b6b6892e-6b4f-4d54-a466-c2c5ec04f4c9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"天地玄黄,宇宙洪荒。日月盈昃,辰宿列张。\n",
"寒来暑往,秋收冬藏。闰余成岁,律吕调阳。\n",
"云腾致雨,露结为霜。金生丽水,玉出昆冈。\n",
"剑号巨阙,珠称夜光。果珍李柰,菜重芥姜。\n",
"海咸河淡,鳞潜羽翔。龙师火帝,鸟官人皇。\n",
"始制文字,乃服衣裳。推位让国,有虞陶唐。\n",
"吊民伐罪,周发殷汤。坐朝问道,垂拱平章。\n",
"爱育黎首,臣伏戎羌。遐迩壹体,率宾归王。\n",
"鸣凤在树,白驹食场。化被草木,赖及万方。\n",
"盖此身发,四大五常。恭惟鞠养,岂敢毁伤。\n",
"女慕贞洁,男效才良。知过必改,得能莫忘。\n",
"罔谈彼短,靡恃己长。信使可覆,器欲难量。\n",
"墨悲丝染,诗赞羔羊。景行维贤,克念作圣。\n",
"德建名立,形端表正。空谷传声,虚堂习听。\n",
"祸因恶积,福缘善庆。尺璧非宝,寸阴是竞。\n",
"资父事君,曰严与敬。孝当竭力,忠则尽命。\n",
"临深履薄,夙兴温凊。似兰斯馨,如松之盛。\n",
"川流不息,渊澄取映。容止若思,言辞安定。\n",
"笃初诚美,慎终宜令。荣业所基,藉甚无竟。\n",
"学优登仕,摄职从政。存以甘棠,去而益咏。\n",
"乐殊贵贱,礼别尊卑。上和下睦,夫唱妇随。\n",
"外受傅训,入奉母仪。诸姑伯叔,犹子比儿。\n",
"孔怀兄弟,同气连枝。交友投分,切磨箴规。\n",
"仁慈隐恻,造次弗离。节义廉退,颠沛匪亏。\n",
"性静情逸,心动神疲。守真志满,逐物意移。\n",
"坚持雅操,好爵自縻。都邑华夏,东西二京。\n",
"背邙面洛,浮渭据泾。宫殿盘郁,楼观飞惊。\n",
"图写禽兽,画彩仙灵。丙舍傍启,甲帐对楹。\n",
"肆筵设席,鼓瑟吹笙。升阶纳陛,弁转疑星。\n",
"右通广内,左达承明。既集坟典,亦聚群英。\n",
"杜稿钟隶,漆书壁经。府罗将相,路侠槐卿。\n",
"户封八县,家给千兵。高冠陪辇,驱毂振缨。\n",
"世禄侈富,车驾肥轻。策功茂实,勒碑刻铭。\n",
"磻溪伊尹,佐时阿衡。奄宅曲阜,微旦孰营。\n",
"桓公匡合,济弱扶倾。绮回汉惠,说感武丁。\n",
"俊乂密勿,多士实宁。晋楚更霸,赵魏困横。\n",
"假途灭虢,践土会盟。何遵约法,韩弊烦刑。\n",
"起翦颇牧,用军最精。宣威沙漠,驰誉丹青。\n",
"九州禹迹,百郡秦并。岳宗恒岱,禅主云亭。\n",
"雁门紫塞,鸡田赤城。昆池碣石,巨野洞庭。\n",
"旷远绵邈,岩岫杳冥。治本于农,务兹稼穑。\n",
"俶载南亩,我艺黍稷。税熟贡新,劝赏黜陟。\n",
"孟轲敦素,史鱼秉直。庶几中庸,劳谦谨敕。\n",
"聆音察理,鉴貌辨色。贻厥嘉猷,勉其祗植。\n",
"省躬讥诫,宠增抗极。殆辱近耻,林皋幸即。\n",
"两疏见机,解组谁逼。索居闲处,沉默寂寥。\n",
"求古寻论,散虑逍遥。欣奏累遣,戚谢欢招。\n",
"渠荷的历,园莽抽条。枇杷晚翠,梧桐早凋。\n",
"陈根委翳,落叶飘摇。游鹍独运,凌摩绛霄。\n",
"耽读玩市,寓目囊箱。易輶攸畏,属耳垣墙。\n",
"具膳餐饭,适口充肠。饱饫烹宰,饥厌糟糠。\n",
"亲戚故旧,老少异粮。妾御绩纺,侍巾帷房。\n",
"纨扇圆洁,银烛炜煌。昼眠夕寐,蓝笋象床。\n",
"弦歌酒宴,接杯举觞。矫手顿足,悦豫且康。\n",
"嫡后嗣续,祭祀烝尝。稽颡再拜,悚惧恐惶。\n",
"笺牒简要,顾答审详。骸垢想浴,执热愿凉。\n",
"驴骡犊特,骇跃超骧。诛斩贼盗,捕获叛亡。\n",
"布射辽丸,嵇琴阮啸。恬笔伦纸,钧巧任钓。\n",
"释纷利俗,并皆佳妙。毛施淑姿,工颦妍笑。\n",
"年矢每催,曦晖朗曜。璇玑悬斡,晦魄环照。\n",
"指薪修祜,永绥吉劭。矩步引领,俯仰廊庙。\n",
"束带矜庄,徘徊瞻眺。孤陋寡闻,愚蒙等诮。\n",
"谓语助者,焉哉乎也。\n"
]
}
],
"source": [
"# print(clean_corpus)\n",
"clean_corpus_s = \" \"\n",
"clean_corpus_s = \"\".join(str(x) for x in clean_corpus)\n",
"print(clean_corpus_s)"
]
},
{
"cell_type": "code",
"execution_count": 83,
"id": "34dfbc8f-df1b-4969-a5fb-d073a08c9a32",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"'\\u5929''\\u5730''\\u7384''\\u9ec4''\\u5b87''\\u5b99''\\u6d2a''\\u8352'。'\\u65e5''\\u6708''\\u76c8''\\u6603''\\u8fb0''\\u5bbf''\\u5217''\\u5f20'。\n",
"'\\u5bd2''\\u6765''\\u6691''\\u5f80''\\u79cb''\\u6536''\\u51ac''\\u85cf'。'\\u95f0''\\u4f59''\\u6210''\\u5c81''\\u5f8b''\\u5415''\\u8c03''\\u9633'。\n",
"'\\u4e91''\\u817e''\\u81f4''\\u96e8''\\u9732''\\u7ed3''\\u4e3a''\\u971c'。'\\u91d1''\\u751f''\\u4e3d''\\u6c34''\\u7389''\\u51fa''\\u6606''\\u5188'。\n",
"'\\u5251''\\u53f7''\\u5de8''\\u9619''\\u73e0''\\u79f0''\\u591c''\\u5149'。'\\u679c''\\u73cd''\\u674e''\\u67f0''\\u83dc''\\u91cd''\\u82a5''\\u59dc'。\n",
"'\\u6d77''\\u54b8''\\u6cb3''\\u6de1''\\u9cde''\\u6f5c''\\u7fbd''\\u7fd4'。'\\u9f99''\\u5e08''\\u706b''\\u5e1d''\\u9e1f''\\u5b98''\\u4eba''\\u7687'。\n",
"'\\u59cb''\\u5236''\\u6587''\\u5b57''\\u4e43''\\u670d''\\u8863''\\u88f3'。'\\u63a8''\\u4f4d''\\u8ba9''\\u56fd''\\u6709''\\u865e''\\u9676''\\u5510'。\n",
"'\\u540a''\\u6c11''\\u4f10''\\u7f6a''\\u5468''\\u53d1''\\u6bb7''\\u6c64'。'\\u5750''\\u671d''\\u95ee''\\u9053''\\u5782''\\u62f1''\\u5e73''\\u7ae0'。\n",
"'\\u7231''\\u80b2''\\u9ece''\\u9996''\\u81e3''\\u4f0f''\\u620e''\\u7f8c'。'\\u9050''\\u8fe9''\\u58f9''\\u4f53''\\u7387''\\u5bbe''\\u5f52''\\u738b'。\n",
"'\\u9e23''\\u51e4''\\u5728''\\u6811''\\u767d''\\u9a79''\\u98df''\\u573a'。'\\u5316''\\u88ab''\\u8349''\\u6728''\\u8d56''\\u53ca''\\u4e07''\\u65b9'。\n",
"'\\u76d6''\\u6b64''\\u8eab''\\u53d1''\\u56db''\\u5927''\\u4e94''\\u5e38'。'\\u606d''\\u60df''\\u97a0''\\u517b''\\u5c82''\\u6562''\\u6bc1''\\u4f24'。\n",
"'\\u5973''\\u6155''\\u8d1e''\\u6d01''\\u7537''\\u6548''\\u624d''\\u826f'。'\\u77e5''\\u8fc7''\\u5fc5''\\u6539''\\u5f97''\\u80fd''\\u83ab''\\u5fd8'。\n",
"'\\u7f54''\\u8c08''\\u5f7c''\\u77ed''\\u9761''\\u6043''\\u5df1''\\u957f'。'\\u4fe1''\\u4f7f''\\u53ef''\\u8986''\\u5668''\\u6b32''\\u96be''\\u91cf'。\n",
"'\\u58a8''\\u60b2''\\u4e1d''\\u67d3''\\u8bd7''\\u8d5e''\\u7f94''\\u7f8a'。'\\u666f''\\u884c''\\u7ef4''\\u8d24''\\u514b''\\u5ff5''\\u4f5c''\\u5723'。\n",
"'\\u5fb7''\\u5efa''\\u540d''\\u7acb''\\u5f62''\\u7aef''\\u8868''\\u6b63'。'\\u7a7a''\\u8c37''\\u4f20''\\u58f0''\\u865a''\\u5802''\\u4e60''\\u542c'。\n",
"'\\u7978''\\u56e0''\\u6076''\\u79ef''\\u798f''\\u7f18''\\u5584''\\u5e86'。'\\u5c3a''\\u74a7''\\u975e''\\u5b9d''\\u5bf8''\\u9634''\\u662f''\\u7ade'。\n",
"'\\u8d44''\\u7236''\\u4e8b''\\u541b''\\u66f0''\\u4e25''\\u4e0e''\\u656c'。'\\u5b5d''\\u5f53''\\u7aed''\\u529b''\\u5fe0''\\u5219''\\u5c3d''\\u547d'。\n",
"'\\u4e34''\\u6df1''\\u5c65''\\u8584''\\u5919''\\u5174''\\u6e29''\\u51ca'。'\\u4f3c''\\u5170''\\u65af''\\u99a8''\\u5982''\\u677e''\\u4e4b''\\u76db'。\n",
"'\\u5ddd''\\u6d41''\\u4e0d''\\u606f''\\u6e0a''\\u6f84''\\u53d6''\\u6620'。'\\u5bb9''\\u6b62''\\u82e5''\\u601d''\\u8a00''\\u8f9e''\\u5b89''\\u5b9a'。\n",
"'\\u7b03''\\u521d''\\u8bda''\\u7f8e''\\u614e''\\u7ec8''\\u5b9c''\\u4ee4'。'\\u8363''\\u4e1a''\\u6240''\\u57fa''\\u85c9''\\u751a''\\u65e0''\\u7adf'。\n",
"'\\u5b66''\\u4f18''\\u767b''\\u4ed5''\\u6444''\\u804c''\\u4ece''\\u653f'。'\\u5b58''\\u4ee5''\\u7518''\\u68e0''\\u53bb''\\u800c''\\u76ca''\\u548f'。\n",
"'\\u4e50''\\u6b8a''\\u8d35''\\u8d31''\\u793c''\\u522b''\\u5c0a''\\u5351'。'\\u4e0a''\\u548c''\\u4e0b''\\u7766''\\u592b''\\u5531''\\u5987''\\u968f'。\n",
"'\\u5916''\\u53d7''\\u5085''\\u8bad''\\u5165''\\u5949''\\u6bcd''\\u4eea'。'\\u8bf8''\\u59d1''\\u4f2f''\\u53d4''\\u72b9''\\u5b50''\\u6bd4''\\u513f'。\n",
"'\\u5b54''\\u6000''\\u5144''\\u5f1f''\\u540c''\\u6c14''\\u8fde''\\u679d'。'\\u4ea4''\\u53cb''\\u6295''\\u5206''\\u5207''\\u78e8''\\u7bb4''\\u89c4'。\n",
"'\\u4ec1''\\u6148''\\u9690''\\u607b''\\u9020''\\u6b21''\\u5f17''\\u79bb'。'\\u8282''\\u4e49''\\u5ec9''\\u9000''\\u98a0''\\u6c9b''\\u532a''\\u4e8f'。\n",
"'\\u6027''\\u9759''\\u60c5''\\u9038''\\u5fc3''\\u52a8''\\u795e''\\u75b2'。'\\u5b88''\\u771f''\\u5fd7''\\u6ee1''\\u9010''\\u7269''\\u610f''\\u79fb'。\n",
"'\\u575a''\\u6301''\\u96c5''\\u64cd''\\u597d''\\u7235''\\u81ea''\\u7e3b'。'\\u90fd''\\u9091''\\u534e''\\u590f''\\u4e1c''\\u897f''\\u4e8c''\\u4eac'。\n",
"'\\u80cc''\\u9099''\\u9762''\\u6d1b''\\u6d6e''\\u6e2d''\\u636e''\\u6cfe'。'\\u5bab''\\u6bbf''\\u76d8''\\u90c1''\\u697c''\\u89c2''\\u98de''\\u60ca'。\n",
"'\\u56fe''\\u5199''\\u79bd''\\u517d''\\u753b''\\u5f69''\\u4ed9''\\u7075'。'\\u4e19''\\u820d''\\u508d''\\u542f''\\u7532''\\u5e10''\\u5bf9''\\u6979'。\n",
"'\\u8086''\\u7b75''\\u8bbe''\\u5e2d''\\u9f13''\\u745f''\\u5439''\\u7b19'。'\\u5347''\\u9636''\\u7eb3''\\u965b''\\u5f01''\\u8f6c''\\u7591''\\u661f'。\n",
"'\\u53f3''\\u901a''\\u5e7f''\\u5185''\\u5de6''\\u8fbe''\\u627f''\\u660e'。'\\u65e2''\\u96c6''\\u575f''\\u5178''\\u4ea6''\\u805a''\\u7fa4''\\u82f1'。\n",
"'\\u675c''\\u7a3f''\\u949f''\\u96b6''\\u6f06''\\u4e66''\\u58c1''\\u7ecf'。'\\u5e9c''\\u7f57''\\u5c06''\\u76f8''\\u8def''\\u4fa0''\\u69d0''\\u537f'。\n",
"'\\u6237''\\u5c01''\\u516b''\\u53bf''\\u5bb6''\\u7ed9''\\u5343''\\u5175'。'\\u9ad8''\\u51a0''\\u966a''\\u8f87''\\u9a71''\\u6bc2''\\u632f''\\u7f28'。\n",
"'\\u4e16''\\u7984''\\u4f88''\\u5bcc''\\u8f66''\\u9a7e''\\u80a5''\\u8f7b'。'\\u7b56''\\u529f''\\u8302''\\u5b9e''\\u52d2''\\u7891''\\u523b''\\u94ed'。\n",
"'\\u78fb''\\u6eaa''\\u4f0a''\\u5c39''\\u4f50''\\u65f6''\\u963f''\\u8861'。'\\u5944''\\u5b85''\\u66f2''\\u961c''\\u5fae''\\u65e6''\\u5b70''\\u8425'。\n",
"'\\u6853''\\u516c''\\u5321''\\u5408''\\u6d4e''\\u5f31''\\u6276''\\u503e'。'\\u7eee''\\u56de''\\u6c49''\\u60e0''\\u8bf4''\\u611f''\\u6b66''\\u4e01'。\n",
"'\\u4fca''\\u4e42''\\u5bc6''\\u52ff''\\u591a''\\u58eb''\\u5b9e''\\u5b81'。'\\u664b''\\u695a''\\u66f4''\\u9738''\\u8d75''\\u9b4f''\\u56f0''\\u6a2a'。\n",
"'\\u5047''\\u9014''\\u706d''\\u8662''\\u8df5''\\u571f''\\u4f1a''\\u76df'。'\\u4f55''\\u9075''\\u7ea6''\\u6cd5''\\u97e9''\\u5f0a''\\u70e6''\\u5211'。\n",
"'\\u8d77''\\u7fe6''\\u9887''\\u7267''\\u7528''\\u519b''\\u6700''\\u7cbe'。'\\u5ba3''\\u5a01''\\u6c99''\\u6f20''\\u9a70''\\u8a89''\\u4e39''\\u9752'。\n",
"'\\u4e5d''\\u5dde''\\u79b9''\\u8ff9''\\u767e''\\u90e1''\\u79e6''\\u5e76'。'\\u5cb3''\\u5b97''\\u6052''\\u5cb1''\\u7985''\\u4e3b''\\u4e91''\\u4ead'。\n",
"'\\u96c1''\\u95e8''\\u7d2b''\\u585e''\\u9e21''\\u7530''\\u8d64''\\u57ce'。'\\u6606''\\u6c60''\\u78a3''\\u77f3''\\u5de8''\\u91ce''\\u6d1e''\\u5ead'。\n",
"'\\u65f7''\\u8fdc''\\u7ef5''\\u9088''\\u5ca9''\\u5cab''\\u6773''\\u51a5'。'\\u6cbb''\\u672c''\\u4e8e''\\u519c''\\u52a1''\\u5179''\\u7a3c''\\u7a51'。\n",
"'\\u4ff6''\\u8f7d''\\u5357''\\u4ea9''\\u6211''\\u827a''\\u9ecd''\\u7a37'。'\\u7a0e''\\u719f''\\u8d21''\\u65b0''\\u529d''\\u8d4f''\\u9edc''\\u965f'。\n",
"'\\u5b5f''\\u8f72''\\u6566''\\u7d20''\\u53f2''\\u9c7c''\\u79c9''\\u76f4'。'\\u5eb6''\\u51e0''\\u4e2d''\\u5eb8''\\u52b3''\\u8c26''\\u8c28''\\u6555'。\n",
"'\\u8046''\\u97f3''\\u5bdf''\\u7406''\\u9274''\\u8c8c''\\u8fa8''\\u8272'。'\\u8d3b''\\u53a5''\\u5609''\\u7337''\\u52c9''\\u5176''\\u7957''\\u690d'。\n",
"'\\u7701''\\u8eac''\\u8ba5''\\u8beb''\\u5ba0''\\u589e''\\u6297''\\u6781'。'\\u6b86''\\u8fb1''\\u8fd1''\\u803b''\\u6797''\\u768b''\\u5e78''\\u5373'。\n",
"'\\u4e24''\\u758f''\\u89c1''\\u673a''\\u89e3''\\u7ec4''\\u8c01''\\u903c'。'\\u7d22''\\u5c45''\\u95f2''\\u5904''\\u6c89''\\u9ed8''\\u5bc2''\\u5be5'。\n",
"'\\u6c42''\\u53e4''\\u5bfb''\\u8bba''\\u6563''\\u8651''\\u900d''\\u9065'。'\\u6b23''\\u594f''\\u7d2f''\\u9063''\\u621a''\\u8c22''\\u6b22''\\u62db'。\n",
"'\\u6e20''\\u8377''\\u7684''\\u5386''\\u56ed''\\u83bd''\\u62bd''\\u6761'。'\\u6787''\\u6777''\\u665a''\\u7fe0''\\u68a7''\\u6850''\\u65e9''\\u51cb'。\n",
"'\\u9648''\\u6839''\\u59d4''\\u7ff3''\\u843d''\\u53f6''\\u98d8''\\u6447'。'\\u6e38''\\u9e4d''\\u72ec''\\u8fd0''\\u51cc''\\u6469''\\u7edb''\\u9704'。\n",
"'\\u803d''\\u8bfb''\\u73a9''\\u5e02''\\u5bd3''\\u76ee''\\u56ca''\\u7bb1'。'\\u6613''\\u8f36''\\u6538''\\u754f''\\u5c5e''\\u8033''\\u57a3''\\u5899'。\n",
"'\\u5177''\\u81b3''\\u9910''\\u996d''\\u9002''\\u53e3''\\u5145''\\u80a0'。'\\u9971''\\u996b''\\u70f9''\\u5bb0''\\u9965''\\u538c''\\u7cdf''\\u7ce0'。\n",
"'\\u4eb2''\\u621a''\\u6545''\\u65e7''\\u8001''\\u5c11''\\u5f02''\\u7cae'。'\\u59be''\\u5fa1''\\u7ee9''\\u7eba''\\u4f8d''\\u5dfe''\\u5e37''\\u623f'。\n",
"'\\u7ea8''\\u6247''\\u5706''\\u6d01''\\u94f6''\\u70db''\\u709c''\\u714c'。'\\u663c''\\u7720''\\u5915''\\u5bd0''\\u84dd''\\u7b0b''\\u8c61''\\u5e8a'。\n",
"'\\u5f26''\\u6b4c''\\u9152''\\u5bb4''\\u63a5''\\u676f''\\u4e3e''\\u89de'。'\\u77eb''\\u624b''\\u987f''\\u8db3''\\u60a6''\\u8c6b''\\u4e14''\\u5eb7'。\n",
"'\\u5ae1''\\u540e''\\u55e3''\\u7eed''\\u796d''\\u7940''\\u70dd''\\u5c1d'。'\\u7a3d''\\u98a1''\\u518d''\\u62dc''\\u609a''\\u60e7''\\u6050''\\u60f6'。\n",
"'\\u7b3a''\\u7252''\\u7b80''\\u8981''\\u987e''\\u7b54''\\u5ba1''\\u8be6'。'\\u9ab8''\\u57a2''\\u60f3''\\u6d74''\\u6267''\\u70ed''\\u613f''\\u51c9'。\n",
"'\\u9a74''\\u9aa1''\\u728a''\\u7279''\\u9a87''\\u8dc3''\\u8d85''\\u9aa7'。'\\u8bdb''\\u65a9''\\u8d3c''\\u76d7''\\u6355''\\u83b7''\\u53db''\\u4ea1'。\n",
"'\\u5e03''\\u5c04''\\u8fbd''\\u4e38''\\u5d47''\\u7434''\\u962e''\\u5578'。'\\u606c''\\u7b14''\\u4f26''\\u7eb8''\\u94a7''\\u5de7''\\u4efb''\\u9493'。\n",
"'\\u91ca''\\u7eb7''\\u5229''\\u4fd7''\\u5e76''\\u7686''\\u4f73''\\u5999'。'\\u6bdb''\\u65bd''\\u6dd1''\\u59ff''\\u5de5''\\u98a6''\\u598d''\\u7b11'。\n",
"'\\u5e74''\\u77e2''\\u6bcf''\\u50ac''\\u66e6''\\u6656''\\u6717''\\u66dc'。'\\u7487''\\u7391''\\u60ac''\\u65a1''\\u6666''\\u9b44''\\u73af''\\u7167'。\n",
"'\\u6307''\\u85aa''\\u4fee''\\u795c''\\u6c38''\\u7ee5''\\u5409''\\u52ad'。'\\u77e9''\\u6b65''\\u5f15''\\u9886''\\u4fef''\\u4ef0''\\u5eca''\\u5e99'。\n",
"'\\u675f''\\u5e26''\\u77dc''\\u5e84''\\u5f98''\\u5f8a''\\u77bb''\\u773a'。'\\u5b64''\\u964b''\\u5be1''\\u95fb''\\u611a''\\u8499''\\u7b49''\\u8bee'。\n",
"'\\u8c13''\\u8bed''\\u52a9''\\u8005''\\u7109''\\u54c9''\\u4e4e''\\u4e5f'。\n"
]
}
],
"source": [
"import re \n",
"# print the ascii number of each text\n",
"punctuation = [\"\",\"。\"]\n",
"line_break = \"\\n\"\n",
"ascii_corpus = \"\"\n",
"ascii_num = \"\"\n",
"\n",
"#ascii_corpus = \"\".join(x for x in clean_corpus)\n",
"\n",
"\n",
"for character in clean_corpus_s:\n",
" #print puntuation as is\n",
" if character in punctuation: \n",
" #print(character)\n",
" # TODO remove unicode \\u\n",
" ascii_corpus += character\n",
" elif character == line_break:\n",
" pass\n",
" ascii_corpus += \"\\n\"\n",
" else:\n",
" ascii_num = ascii(character)\n",
" #print(ascii_num)\n",
" ascii_corpus += ascii_num\n",
"print(ascii_corpus)\n"
]
},
{
"cell_type": "code",
"execution_count": 84,
"id": "78e14928-2ac6-48ca-be8d-1ecc73f6603d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"8312"
]
},
"execution_count": 84,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# write corpus to a new file\n",
"ascii_output = open(\"files/ascii_output.txt\",\"w\")\n",
"ascii_output.write(ascii_corpus)"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "b930f8f8-33d8-4856-9704-064bda5e88a9",
"metadata": {},
"outputs": [],
"source": [
"# more unicode howto\n",
"# https://docs.python.org/3/howto/unicode.html"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "ee668d04-d1b5-4c6e-8868-f500015c8934",
"metadata": {},
"outputs": [],
"source": [
"# preserve the original punctuation and layout\n",
"# need to turn the list back into a string to preserve string layout? "
]
},
{
"cell_type": "code",
"execution_count": 64,
"id": "862091df-0d0b-449a-a402-e3ecf007c252",
"metadata": {},
"outputs": [],
"source": [
"thousand_w.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3736d5e5-c332-4ab7-8333-9c1efd4e62c8",
"metadata": {},
"outputs": [],
"source": [
"# https://github.com/callmefeifei/baby-names\n",
"# https://blog.csdn.net/anmo9499/article/details/101646224\n",
"# https://www.cnblogs.com/zhongbin/p/3273086.html\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}