# %%
import string

# %%
# ascii() does not return an "ASCII encoding" of a character: it returns the
# printable repr() of its argument, with every non-ASCII character replaced
# by a \x, \u or \U escape sequence.

# %%
ascii("你")

# %%
# Pure-ASCII input comes back unchanged (apart from the repr quotes).

# %%
ascii("a")

# %%
# ord() returns the integer Unicode code point of a one-character string.

# %%
ord("a")

# %%
ord("A")

# %%
# Build the lower- and upper-case alphabets from the standard library
# constant instead of a hand-typed list.
alphabet = list(string.ascii_lowercase)
ALPHABET = [letter.upper() for letter in alphabet]


# %%
def _print_code_points(letters):
    """Print a header line, then every letter followed by its ord() value.

    Args:
        letters: iterable of one-character strings.
    """
    print("Printing corresponding ASCII numerical representation :")
    for letter in letters:
        print("{} :{}".format(letter, ord(letter)), end=" ")


# %%
_print_code_points(alphabet)

# %%
_print_code_points(ALPHABET)

# %%
# In ASCII a lower-case letter sits exactly 32 code points above its
# upper-case counterpart (ord("a") == 97, ord("A") == 65), which allows the
# following hack that avoids str.upper() / str.lower().


# %%
def upperHack(c):
    """Return the upper-case form of an ASCII lower-case letter.

    Args:
        c: a one-character string.

    Returns:
        The matching upper-case letter when ``c`` is in "a".."z"; any other
        single character is returned unchanged, because the 32 offset only
        holds for ASCII letters.

    Raises:
        TypeError: from ord() when ``c`` is a string that passes the range
            check but is not exactly one character long.
    """
    # Guard the range: subtracting 32 from anything else yields an unrelated
    # character (for example "A" would become "!").
    if "a" <= c <= "z":
        return chr(ord(c) - 32)
    return c


# %%
upperHack("c")


# %%
def lowerHack(C):
    """Return the lower-case form of an ASCII upper-case letter.

    Args:
        C: a one-character string.

    Returns:
        The matching lower-case letter when ``C`` is in "A".."Z"; any other
        single character is returned unchanged.

    Raises:
        TypeError: from ord() when ``C`` is a string that passes the range
            check but is not exactly one character long.
    """
    # Guard the range: adding 32 to anything else yields an unrelated
    # character (for example "c" would become "\x83").
    if "A" <= C <= "Z":
        return chr(ord(C) + 32)
    return C


# %%
lowerHack("C")

# %%
# OK, now let's apply the approach used for Western text to Chinese.
# In Latin-script languages words are built from letters; in Chinese,
# characters are built from strokes.
# First, think at the system level: strokes are the material from which a
# single Chinese character is constructed, and Python lets us do exploratory
# programming over strokes.
# Alternatively, we can think at the system level of a "corpus".
# The "Thousand Character Classic" contains one thousand non-repeating Chinese
# characters; it can be understood as the ancients' literacy primer.
# The individual-to-system relationship between a character and the Thousand
# Character Classic resembles the relationship between a stroke and the
# character 永 in the "Eight Principles of Yong".
# Taking Wang Xizhi's 永 as the model, pupils copied it over and over to master
# its eight strokes (侧, 勒, 弩, 趯, 策, 掠, 啄, 磔), hoping to learn to write
# every character well.
# Likewise, taking the Thousand Character Classic as the model, pupils recited
# and studied it repeatedly in order to learn the characters and to compose
# text with the character as the unit.
# %%
# Read in the "Thousand Character Essay".
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default.
# NOTE(review): assumes the file is stored as UTF-8 -- confirm.
# The context manager closes the handle as soon as the text has been read.
with open("files/thousand_char_essay.txt", "r", encoding="utf-8") as thousand_w:
    corpus = thousand_w.read()
print(corpus)

# %%
# Remove the running counter numbers printed at the end of each line of the
# corpus. Iterating a string yields single characters, so every digit
# character is dropped individually; all other characters, including
# punctuation and newlines, are kept in order.
clean_corpus = [character for character in corpus if not character.isdigit()]

# %%
# Turn the list of characters back into one string, which preserves the
# original punctuation and line layout.
clean_corpus_s = "".join(clean_corpus)
print(clean_corpus_s)
"'\\u5929''\\u5730''\\u7384''\\u9ec4','\\u5b87''\\u5b99''\\u6d2a''\\u8352'。'\\u65e5''\\u6708''\\u76c8''\\u6603','\\u8fb0''\\u5bbf''\\u5217''\\u5f20'。\n", "'\\u5bd2''\\u6765''\\u6691''\\u5f80','\\u79cb''\\u6536''\\u51ac''\\u85cf'。'\\u95f0''\\u4f59''\\u6210''\\u5c81','\\u5f8b''\\u5415''\\u8c03''\\u9633'。\n", "'\\u4e91''\\u817e''\\u81f4''\\u96e8','\\u9732''\\u7ed3''\\u4e3a''\\u971c'。'\\u91d1''\\u751f''\\u4e3d''\\u6c34','\\u7389''\\u51fa''\\u6606''\\u5188'。\n", "'\\u5251''\\u53f7''\\u5de8''\\u9619','\\u73e0''\\u79f0''\\u591c''\\u5149'。'\\u679c''\\u73cd''\\u674e''\\u67f0','\\u83dc''\\u91cd''\\u82a5''\\u59dc'。\n", "'\\u6d77''\\u54b8''\\u6cb3''\\u6de1','\\u9cde''\\u6f5c''\\u7fbd''\\u7fd4'。'\\u9f99''\\u5e08''\\u706b''\\u5e1d','\\u9e1f''\\u5b98''\\u4eba''\\u7687'。\n", "'\\u59cb''\\u5236''\\u6587''\\u5b57','\\u4e43''\\u670d''\\u8863''\\u88f3'。'\\u63a8''\\u4f4d''\\u8ba9''\\u56fd','\\u6709''\\u865e''\\u9676''\\u5510'。\n", "'\\u540a''\\u6c11''\\u4f10''\\u7f6a','\\u5468''\\u53d1''\\u6bb7''\\u6c64'。'\\u5750''\\u671d''\\u95ee''\\u9053','\\u5782''\\u62f1''\\u5e73''\\u7ae0'。\n", "'\\u7231''\\u80b2''\\u9ece''\\u9996','\\u81e3''\\u4f0f''\\u620e''\\u7f8c'。'\\u9050''\\u8fe9''\\u58f9''\\u4f53','\\u7387''\\u5bbe''\\u5f52''\\u738b'。\n", "'\\u9e23''\\u51e4''\\u5728''\\u6811','\\u767d''\\u9a79''\\u98df''\\u573a'。'\\u5316''\\u88ab''\\u8349''\\u6728','\\u8d56''\\u53ca''\\u4e07''\\u65b9'。\n", "'\\u76d6''\\u6b64''\\u8eab''\\u53d1','\\u56db''\\u5927''\\u4e94''\\u5e38'。'\\u606d''\\u60df''\\u97a0''\\u517b','\\u5c82''\\u6562''\\u6bc1''\\u4f24'。\n", "'\\u5973''\\u6155''\\u8d1e''\\u6d01','\\u7537''\\u6548''\\u624d''\\u826f'。'\\u77e5''\\u8fc7''\\u5fc5''\\u6539','\\u5f97''\\u80fd''\\u83ab''\\u5fd8'。\n", "'\\u7f54''\\u8c08''\\u5f7c''\\u77ed','\\u9761''\\u6043''\\u5df1''\\u957f'。'\\u4fe1''\\u4f7f''\\u53ef''\\u8986','\\u5668''\\u6b32''\\u96be''\\u91cf'。\n", 
"'\\u58a8''\\u60b2''\\u4e1d''\\u67d3','\\u8bd7''\\u8d5e''\\u7f94''\\u7f8a'。'\\u666f''\\u884c''\\u7ef4''\\u8d24','\\u514b''\\u5ff5''\\u4f5c''\\u5723'。\n", "'\\u5fb7''\\u5efa''\\u540d''\\u7acb','\\u5f62''\\u7aef''\\u8868''\\u6b63'。'\\u7a7a''\\u8c37''\\u4f20''\\u58f0','\\u865a''\\u5802''\\u4e60''\\u542c'。\n", "'\\u7978''\\u56e0''\\u6076''\\u79ef','\\u798f''\\u7f18''\\u5584''\\u5e86'。'\\u5c3a''\\u74a7''\\u975e''\\u5b9d','\\u5bf8''\\u9634''\\u662f''\\u7ade'。\n", "'\\u8d44''\\u7236''\\u4e8b''\\u541b','\\u66f0''\\u4e25''\\u4e0e''\\u656c'。'\\u5b5d''\\u5f53''\\u7aed''\\u529b','\\u5fe0''\\u5219''\\u5c3d''\\u547d'。\n", "'\\u4e34''\\u6df1''\\u5c65''\\u8584','\\u5919''\\u5174''\\u6e29''\\u51ca'。'\\u4f3c''\\u5170''\\u65af''\\u99a8','\\u5982''\\u677e''\\u4e4b''\\u76db'。\n", "'\\u5ddd''\\u6d41''\\u4e0d''\\u606f','\\u6e0a''\\u6f84''\\u53d6''\\u6620'。'\\u5bb9''\\u6b62''\\u82e5''\\u601d','\\u8a00''\\u8f9e''\\u5b89''\\u5b9a'。\n", "'\\u7b03''\\u521d''\\u8bda''\\u7f8e','\\u614e''\\u7ec8''\\u5b9c''\\u4ee4'。'\\u8363''\\u4e1a''\\u6240''\\u57fa','\\u85c9''\\u751a''\\u65e0''\\u7adf'。\n", "'\\u5b66''\\u4f18''\\u767b''\\u4ed5','\\u6444''\\u804c''\\u4ece''\\u653f'。'\\u5b58''\\u4ee5''\\u7518''\\u68e0','\\u53bb''\\u800c''\\u76ca''\\u548f'。\n", "'\\u4e50''\\u6b8a''\\u8d35''\\u8d31','\\u793c''\\u522b''\\u5c0a''\\u5351'。'\\u4e0a''\\u548c''\\u4e0b''\\u7766','\\u592b''\\u5531''\\u5987''\\u968f'。\n", "'\\u5916''\\u53d7''\\u5085''\\u8bad','\\u5165''\\u5949''\\u6bcd''\\u4eea'。'\\u8bf8''\\u59d1''\\u4f2f''\\u53d4','\\u72b9''\\u5b50''\\u6bd4''\\u513f'。\n", "'\\u5b54''\\u6000''\\u5144''\\u5f1f','\\u540c''\\u6c14''\\u8fde''\\u679d'。'\\u4ea4''\\u53cb''\\u6295''\\u5206','\\u5207''\\u78e8''\\u7bb4''\\u89c4'。\n", "'\\u4ec1''\\u6148''\\u9690''\\u607b','\\u9020''\\u6b21''\\u5f17''\\u79bb'。'\\u8282''\\u4e49''\\u5ec9''\\u9000','\\u98a0''\\u6c9b''\\u532a''\\u4e8f'。\n", 
"'\\u6027''\\u9759''\\u60c5''\\u9038','\\u5fc3''\\u52a8''\\u795e''\\u75b2'。'\\u5b88''\\u771f''\\u5fd7''\\u6ee1','\\u9010''\\u7269''\\u610f''\\u79fb'。\n", "'\\u575a''\\u6301''\\u96c5''\\u64cd','\\u597d''\\u7235''\\u81ea''\\u7e3b'。'\\u90fd''\\u9091''\\u534e''\\u590f','\\u4e1c''\\u897f''\\u4e8c''\\u4eac'。\n", "'\\u80cc''\\u9099''\\u9762''\\u6d1b','\\u6d6e''\\u6e2d''\\u636e''\\u6cfe'。'\\u5bab''\\u6bbf''\\u76d8''\\u90c1','\\u697c''\\u89c2''\\u98de''\\u60ca'。\n", "'\\u56fe''\\u5199''\\u79bd''\\u517d','\\u753b''\\u5f69''\\u4ed9''\\u7075'。'\\u4e19''\\u820d''\\u508d''\\u542f','\\u7532''\\u5e10''\\u5bf9''\\u6979'。\n", "'\\u8086''\\u7b75''\\u8bbe''\\u5e2d','\\u9f13''\\u745f''\\u5439''\\u7b19'。'\\u5347''\\u9636''\\u7eb3''\\u965b','\\u5f01''\\u8f6c''\\u7591''\\u661f'。\n", "'\\u53f3''\\u901a''\\u5e7f''\\u5185','\\u5de6''\\u8fbe''\\u627f''\\u660e'。'\\u65e2''\\u96c6''\\u575f''\\u5178','\\u4ea6''\\u805a''\\u7fa4''\\u82f1'。\n", "'\\u675c''\\u7a3f''\\u949f''\\u96b6','\\u6f06''\\u4e66''\\u58c1''\\u7ecf'。'\\u5e9c''\\u7f57''\\u5c06''\\u76f8','\\u8def''\\u4fa0''\\u69d0''\\u537f'。\n", "'\\u6237''\\u5c01''\\u516b''\\u53bf','\\u5bb6''\\u7ed9''\\u5343''\\u5175'。'\\u9ad8''\\u51a0''\\u966a''\\u8f87','\\u9a71''\\u6bc2''\\u632f''\\u7f28'。\n", "'\\u4e16''\\u7984''\\u4f88''\\u5bcc','\\u8f66''\\u9a7e''\\u80a5''\\u8f7b'。'\\u7b56''\\u529f''\\u8302''\\u5b9e','\\u52d2''\\u7891''\\u523b''\\u94ed'。\n", "'\\u78fb''\\u6eaa''\\u4f0a''\\u5c39','\\u4f50''\\u65f6''\\u963f''\\u8861'。'\\u5944''\\u5b85''\\u66f2''\\u961c','\\u5fae''\\u65e6''\\u5b70''\\u8425'。\n", "'\\u6853''\\u516c''\\u5321''\\u5408','\\u6d4e''\\u5f31''\\u6276''\\u503e'。'\\u7eee''\\u56de''\\u6c49''\\u60e0','\\u8bf4''\\u611f''\\u6b66''\\u4e01'。\n", "'\\u4fca''\\u4e42''\\u5bc6''\\u52ff','\\u591a''\\u58eb''\\u5b9e''\\u5b81'。'\\u664b''\\u695a''\\u66f4''\\u9738','\\u8d75''\\u9b4f''\\u56f0''\\u6a2a'。\n", 
"'\\u5047''\\u9014''\\u706d''\\u8662','\\u8df5''\\u571f''\\u4f1a''\\u76df'。'\\u4f55''\\u9075''\\u7ea6''\\u6cd5','\\u97e9''\\u5f0a''\\u70e6''\\u5211'。\n", "'\\u8d77''\\u7fe6''\\u9887''\\u7267','\\u7528''\\u519b''\\u6700''\\u7cbe'。'\\u5ba3''\\u5a01''\\u6c99''\\u6f20','\\u9a70''\\u8a89''\\u4e39''\\u9752'。\n", "'\\u4e5d''\\u5dde''\\u79b9''\\u8ff9','\\u767e''\\u90e1''\\u79e6''\\u5e76'。'\\u5cb3''\\u5b97''\\u6052''\\u5cb1','\\u7985''\\u4e3b''\\u4e91''\\u4ead'。\n", "'\\u96c1''\\u95e8''\\u7d2b''\\u585e','\\u9e21''\\u7530''\\u8d64''\\u57ce'。'\\u6606''\\u6c60''\\u78a3''\\u77f3','\\u5de8''\\u91ce''\\u6d1e''\\u5ead'。\n", "'\\u65f7''\\u8fdc''\\u7ef5''\\u9088','\\u5ca9''\\u5cab''\\u6773''\\u51a5'。'\\u6cbb''\\u672c''\\u4e8e''\\u519c','\\u52a1''\\u5179''\\u7a3c''\\u7a51'。\n", "'\\u4ff6''\\u8f7d''\\u5357''\\u4ea9','\\u6211''\\u827a''\\u9ecd''\\u7a37'。'\\u7a0e''\\u719f''\\u8d21''\\u65b0','\\u529d''\\u8d4f''\\u9edc''\\u965f'。\n", "'\\u5b5f''\\u8f72''\\u6566''\\u7d20','\\u53f2''\\u9c7c''\\u79c9''\\u76f4'。'\\u5eb6''\\u51e0''\\u4e2d''\\u5eb8','\\u52b3''\\u8c26''\\u8c28''\\u6555'。\n", "'\\u8046''\\u97f3''\\u5bdf''\\u7406','\\u9274''\\u8c8c''\\u8fa8''\\u8272'。'\\u8d3b''\\u53a5''\\u5609''\\u7337','\\u52c9''\\u5176''\\u7957''\\u690d'。\n", "'\\u7701''\\u8eac''\\u8ba5''\\u8beb','\\u5ba0''\\u589e''\\u6297''\\u6781'。'\\u6b86''\\u8fb1''\\u8fd1''\\u803b','\\u6797''\\u768b''\\u5e78''\\u5373'。\n", "'\\u4e24''\\u758f''\\u89c1''\\u673a','\\u89e3''\\u7ec4''\\u8c01''\\u903c'。'\\u7d22''\\u5c45''\\u95f2''\\u5904','\\u6c89''\\u9ed8''\\u5bc2''\\u5be5'。\n", "'\\u6c42''\\u53e4''\\u5bfb''\\u8bba','\\u6563''\\u8651''\\u900d''\\u9065'。'\\u6b23''\\u594f''\\u7d2f''\\u9063','\\u621a''\\u8c22''\\u6b22''\\u62db'。\n", "'\\u6e20''\\u8377''\\u7684''\\u5386','\\u56ed''\\u83bd''\\u62bd''\\u6761'。'\\u6787''\\u6777''\\u665a''\\u7fe0','\\u68a7''\\u6850''\\u65e9''\\u51cb'。\n", 
"'\\u9648''\\u6839''\\u59d4''\\u7ff3','\\u843d''\\u53f6''\\u98d8''\\u6447'。'\\u6e38''\\u9e4d''\\u72ec''\\u8fd0','\\u51cc''\\u6469''\\u7edb''\\u9704'。\n", "'\\u803d''\\u8bfb''\\u73a9''\\u5e02','\\u5bd3''\\u76ee''\\u56ca''\\u7bb1'。'\\u6613''\\u8f36''\\u6538''\\u754f','\\u5c5e''\\u8033''\\u57a3''\\u5899'。\n", "'\\u5177''\\u81b3''\\u9910''\\u996d','\\u9002''\\u53e3''\\u5145''\\u80a0'。'\\u9971''\\u996b''\\u70f9''\\u5bb0','\\u9965''\\u538c''\\u7cdf''\\u7ce0'。\n", "'\\u4eb2''\\u621a''\\u6545''\\u65e7','\\u8001''\\u5c11''\\u5f02''\\u7cae'。'\\u59be''\\u5fa1''\\u7ee9''\\u7eba','\\u4f8d''\\u5dfe''\\u5e37''\\u623f'。\n", "'\\u7ea8''\\u6247''\\u5706''\\u6d01','\\u94f6''\\u70db''\\u709c''\\u714c'。'\\u663c''\\u7720''\\u5915''\\u5bd0','\\u84dd''\\u7b0b''\\u8c61''\\u5e8a'。\n", "'\\u5f26''\\u6b4c''\\u9152''\\u5bb4','\\u63a5''\\u676f''\\u4e3e''\\u89de'。'\\u77eb''\\u624b''\\u987f''\\u8db3','\\u60a6''\\u8c6b''\\u4e14''\\u5eb7'。\n", "'\\u5ae1''\\u540e''\\u55e3''\\u7eed','\\u796d''\\u7940''\\u70dd''\\u5c1d'。'\\u7a3d''\\u98a1''\\u518d''\\u62dc','\\u609a''\\u60e7''\\u6050''\\u60f6'。\n", "'\\u7b3a''\\u7252''\\u7b80''\\u8981','\\u987e''\\u7b54''\\u5ba1''\\u8be6'。'\\u9ab8''\\u57a2''\\u60f3''\\u6d74','\\u6267''\\u70ed''\\u613f''\\u51c9'。\n", "'\\u9a74''\\u9aa1''\\u728a''\\u7279','\\u9a87''\\u8dc3''\\u8d85''\\u9aa7'。'\\u8bdb''\\u65a9''\\u8d3c''\\u76d7','\\u6355''\\u83b7''\\u53db''\\u4ea1'。\n", "'\\u5e03''\\u5c04''\\u8fbd''\\u4e38','\\u5d47''\\u7434''\\u962e''\\u5578'。'\\u606c''\\u7b14''\\u4f26''\\u7eb8','\\u94a7''\\u5de7''\\u4efb''\\u9493'。\n", "'\\u91ca''\\u7eb7''\\u5229''\\u4fd7','\\u5e76''\\u7686''\\u4f73''\\u5999'。'\\u6bdb''\\u65bd''\\u6dd1''\\u59ff','\\u5de5''\\u98a6''\\u598d''\\u7b11'。\n", "'\\u5e74''\\u77e2''\\u6bcf''\\u50ac','\\u66e6''\\u6656''\\u6717''\\u66dc'。'\\u7487''\\u7391''\\u60ac''\\u65a1','\\u6666''\\u9b44''\\u73af''\\u7167'。\n", 
# %%
import re  # NOTE(review): not used by any code in this cell

# Replace every character of the cleaned corpus with its ascii() form: the
# quoted repr in which a non-ASCII character appears as a \u escape.
# The two punctuation marks and the line breaks are copied through untouched
# so the layout of the essay is preserved.
punctuation = [",", "。"]
line_break = "\n"

# TODO remove unicode \u
# Collect the pieces in a list and join once instead of growing a string
# with += inside the loop.
ascii_pieces = []
for character in clean_corpus_s:
    if character in punctuation or character == line_break:
        # keep punctuation and newlines as they are
        ascii_pieces.append(character)
    else:
        ascii_pieces.append(ascii(character))
ascii_corpus = "".join(ascii_pieces)
print(ascii_corpus)

# %%
# Write the converted corpus to a new file.
# The text still contains the non-ASCII punctuation marks, so the encoding is
# stated explicitly; the context manager flushes and closes the file.
with open("files/ascii_output.txt", "w", encoding="utf-8") as ascii_output:
    ascii_output.write(ascii_corpus)

# %%
# more unicode howto
# https://docs.python.org/3/howto/unicode.html

# %%
# preserve the original punctuation and layout
# need to turn the list back into a string to preserve string layout?

# %%
# Release the handle of the source text file opened in the read cell.
thousand_w.close()

# %%
# https://github.com/callmefeifei/baby-names
# https://blog.csdn.net/anmo9499/article/details/101646224
# https://www.cnblogs.com/zhongbin/p/3273086.html