{ "cells": [ { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [
 "# ASCII punctuation mapped to its full-width (Chinese) counterpart.\n",
 "# The targets are spelled as backslash-u escapes so that tools which\n",
 "# Unicode-normalize this file cannot fold them back into the ASCII\n",
 "# characters they are supposed to replace.\n",
 "_FULLWIDTH_PUNCTUATION = str.maketrans({\n",
 "    \",\": \"\\uFF0C\",  # FULLWIDTH COMMA\n",
 "    \"!\": \"\\uFF01\",  # FULLWIDTH EXCLAMATION MARK\n",
 "    \"?\": \"\\uFF1F\",  # FULLWIDTH QUESTION MARK\n",
 "    \":\": \"\\uFF1A\",  # FULLWIDTH COLON\n",
 "    \";\": \"\\uFF1B\",  # FULLWIDTH SEMICOLON\n",
 "    \"(\": \"\\uFF08\",  # FULLWIDTH LEFT PARENTHESIS\n",
 "    \")\": \"\\uFF09\",  # FULLWIDTH RIGHT PARENTHESIS\n",
 "})\n",
 "\n",
 "\n",
 "def normalize_punctuations(line: str) -> str:\n",
 "    \"\"\"Return `line` with English punctuation replaced by Chinese (full-width) punctuation.\n",
 "\n",
 "    Only the seven marks listed in `_FULLWIDTH_PUNCTUATION` are converted;\n",
 "    every other character is returned unchanged.\n",
 "    \"\"\"\n",
 "    return line.translate(_FULLWIDTH_PUNCTUATION)"
 ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Got 14838 Cantonese sentences with length >= 5\n" ] } ], "source": [
 "import re\n",
 "from functools import reduce\n",
 "\n",
 "# A sentence line starts with an optional run of digits followed by \"hz \";\n",
 "# everything after that prefix is the sentence text.\n",
 "can_sentence_start = re.compile(r\"[0-9]*hz \")\n",
 "\n",
 "ABC_FILES = [\n",
 "    \"train/abc/abc_cantonese_index_00001_to_04587_line_1_to_4575.xml\",\n",
 "    \"train/abc/abc_cantonese_index_04588_to_09175_line_4576_to_9150.xml\",\n",
 "    \"train/abc/abc_cantonese_index_09176_to_13775_line_9151_to_13725.xml\",\n",
 "    \"train/abc/abc_cantonese_index_13776_to_FE99FD5B4E37BE32_line_13726_to_18302.xml\",\n",
 "]\n",
 "# Sentences shorter than this many characters (after stripping) are dropped.\n",
 "MIN_SENTENCE_LENGTH = 5\n",
 "\n",
 "can_lines = []\n",
 "for abc_path in ABC_FILES:\n",
 "    # Explicit UTF-8: without it open() falls back to the platform's locale\n",
 "    # encoding, which is not guaranteed to decode Chinese text.\n",
 "    # NOTE(review): assumes the corpus files are UTF-8 - confirm.\n",
 "    with open(abc_path, \"r\", encoding=\"utf-8\") as abc_file:\n",
 "        for line in abc_file.read().splitlines():\n",
 "            match = can_sentence_start.match(line)\n",
 "            # Lines carrying the \"(empty band???)\" marker are skipped entirely.\n",
 "            if match and \"(empty band???)\" not in line:\n",
 "                line = line[match.end():].strip()\n",
 "                if len(line) >= MIN_SENTENCE_LENGTH:\n",
 "                    can_lines.append(normalize_punctuations(line))\n",
 "\n",
 "print(\"Got {} Cantonese sentences with length >= {}\".format(len(can_lines), MIN_SENTENCE_LENGTH))"
 ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Found 4527 common Cantonese characters\n" ] } ], "source": [
 "common_can_charset = set()\n",
 "\n",
 "# Every character of the file (line breaks included) counts as \"common\".\n",
 "# NOTE(review): assumes train/wordshk.can is UTF-8 - confirm.\n",
 "with open(\"train/wordshk.can\", \"r\", encoding=\"utf-8\") as wordshk_file:\n",
 "    common_can_charset.update(wordshk_file.read())\n", 
"\n", "print(f\"Found {len(common_can_charset)} common Cantonese characters\")" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Found 365 rare Cantonese characters\n", "𠹺 388\n", "噖 162\n", "𡁵 157\n", "𠶧 88\n", "嚫 88\n", "屘 57\n", "衭 47\n", "贃 43\n", "說 35\n", "𧵳 30\n", "歳 27\n", "𢫏 27\n", "𨶙 25\n", "癐 25\n", "𦡆 25\n", "𨃩 24\n", "况 21\n", "内 19\n", "𢵌 19\n", "𦧺 18\n", "𠹌 18\n", "爲 16\n", "𢱑 16\n", "𡁯 15\n", "𠱓 14\n", "𠵿 14\n", "踹 13\n", "㗇 13\n", "𠾴 13\n", "嗍 13\n", "𧘹 13\n", "𠹳 12\n", "𠹭 12\n", "脫 12\n", "䁪 11\n", "𧨾 11\n", "掬 11\n", "𠸐 11\n", "啥 11\n", "𠱃 10\n", "噔 10\n", "捹 10\n", "𠹻 10\n", "𠼻 10\n", "噠 10\n", "𨳊 10\n", "𢲲 9\n", "𨉖 9\n", "躭 9\n", "䠋 9\n", "嘮 9\n", "啽 9\n", "滮 8\n", "㧻 8\n", "𧶄 8\n", "𦛚 8\n", "撠 8\n", "呡 8\n", "睸 8\n", "𠰲 8\n", "𥔿 8\n", "唎 8\n", "𠸊 8\n", "𬜐 8\n", "蔥 8\n", "呱 8\n", "B 7\n", "𢯊 7\n", "𫫃 7\n", "𢝵 7\n", "銹 7\n", "㓤 7\n", "䁯 7\n", "啉 7\n", "臥 7\n", "𠓼 7\n", "稅 7\n", " 7\n", "喴 7\n", "噱 7\n", "衛 6\n", "𡄯 6\n", "揤 6\n", "𢤹 6\n", " 6\n", "鷄 6\n", "湴 6\n", " 6\n", "𦣇 6\n", "齧 6\n", "𠮨 6\n", " 6\n", "𡀝 6\n", "婄 6\n", "𠼱 6\n", "𠱂 5\n", "磧 5\n", "𠰋 5\n", "𡂖 5\n", "浭 5\n", "擏 5\n", "𥋇 5\n", "揢 5\n", "㨆 5\n", "𠾍 5\n", "兌 5\n", "𢺳 5\n", "坺 5\n", "鍚 5\n", "𣘚 5\n", "𪘁 5\n", "𨳍 5\n", "嗙 5\n", "𠼰 5\n", "𨳒 4\n", "唿 4\n", "𣳼 4\n", "𦂥 4\n", "溚 4\n", "囋 4\n", "瀄 4\n", "𠌥 4\n", "𢫦 4\n", "𢶍 4\n", "𠲵 4\n", "䉺 4\n", "炕 4\n", "𢴈 4\n", "𡲢 4\n", "𥅈 4\n", "𬧊 4\n", "簕 4\n", "査 4\n", "𩜠 4\n", "𫬿 4\n", "𠜱 4\n", "嚬 4\n", "𠹹 4\n", "𦉘 4\n", "唦 4\n", "㨘 4\n", "𡄽 4\n", "熗 4\n", "𡁷 4\n", "𠿬 4\n", "咜 4\n", "𠸏 4\n", "𡁸 4\n", "𡃵 4\n", "𪚩 4\n", "D 4\n", "Q 4\n", "𨆯 3\n", "啗 3\n", "蔸 3\n", "舗 3\n", "囪 3\n", "艔 3\n", "洩 3\n", "𢵧 3\n", "菓 3\n", "䪴 3\n", "䆲 3\n", "痱 3\n", "趿 3\n", "𠮩 3\n", "搉 3\n", "矋 3\n", "𠻗 3\n", "𢲈 3\n", "潞 3\n", "沬 3\n", "揇 3\n", "齃 3\n", "𡃤 3\n", "𡃶 3\n", "瀟 3\n", "軨 3\n", "鉻 3\n", " 3\n", "㿭 3\n", "𢵄 3\n", "㗲 3\n", "𢫕 3\n", "𢰸 3\n", "葫 3\n", "咔 3\n", "嚎 3\n", "嗿 
3\n", "咈 3\n", "咾 3\n", " 3\n", "𠵈 3\n", "吥 3\n", "𠾭 3\n", "𠾵 3\n", "朘 3\n", "觥 3\n", "㩧 2\n", "焙 2\n", "兀 2\n", "䭤 2\n", "饊 2\n", "[ 2\n", "] 2\n", "炖 2\n", "争 2\n", "䁓 2\n", "𡂝 2\n", "𩬎 2\n", "鈒 2\n", "亁 2\n", "炠 2\n", "摼 2\n", "𠺬 2\n", "𠵉 2\n", "蝄 2\n", " 2\n", "蔫 2\n", "㘉 2\n", "荏 2\n", "墘 2\n", "嗏 2\n", "呣 2\n", "曚 2\n", "壬 2\n", "揅 2\n", "溼 2\n", "囓 2\n", "嚙 2\n", "枴 2\n", "𡃀 2\n", "饑 2\n", "䏭 2\n", "挼 2\n", "掱 2\n", "咑 2\n", "芙 2\n", "𦂗 2\n", "舦 2\n", "𢶤 2\n", "翡 2\n", "翠 2\n", "酡 2\n", "𫭊 2\n", "煀 2\n", "耙 2\n", "𠿭 2\n", "鉤 2\n", "𠻘 2\n", "脽 2\n", "焊 2\n", "唊 2\n", "胅 2\n", "翕 2\n", "摜 2\n", "僚 1\n", "𩗴 1\n", "毡 1\n", "跤 1\n", "梧 1\n", "痄 1\n", "卟 1\n", "劄 1\n", "𠶜 1\n", "睜 1\n", "迹 1\n", "揃 1\n", "唨 1\n", "謢 1\n", "菻 1\n", "𣚺 1\n", "鷓 1\n", "鴣 1\n", "强 1\n", "𠾶 1\n", "𡆀 1\n", "拫 1\n", "𠼮 1\n", "汞 1\n", "㤿 1\n", "厴 1\n", "𥀬 1\n", "牯 1\n", "𡇙 1\n", "讕 1\n", "𠿫 1\n", "瘺 1\n", "骲 1\n", "𫲭 1\n", "瓏 1\n", "繚 1\n", "撿 1\n", "跀 1\n", "𢛴 1\n", "蝻 1\n", "赧 1\n", "𪙛 1\n", " 1\n", "檳 1\n", "潲 1\n", "𢶠 1\n", "秧 1\n", "蒔 1\n", "炩 1\n", "㩋 1\n", "饅 1\n", "鍍 1\n", "𢚖 1\n", "𧊅 1\n", " 1\n", "篸 1\n", "𩟔 1\n", "撍 1\n", "栗 1\n", " 1\n", "𡆇 1\n", "杧 1\n", "榛 1\n", "蠄 1\n", "蟧 1\n", "嘶 1\n", "梆 1\n", "竪 1\n", "騾 1\n", "矺 1\n", "堀 1\n", "麝 1\n", "慪 1\n", "撴 1\n", "哾 1\n", "𠳖 1\n", "洌 1\n", "霹 1\n", "𠾼 1\n", "𬦠 1\n", "𤌍 1\n", "𬧯 1\n", "厠 1\n", "㖡 1\n", "跁 1\n", "鉎 1\n", "𧣈 1\n", "𠳏 1\n", "㹃 1\n", "𧝞 1\n", "𡀞 1\n", "㦒 1\n", "𩩍 1\n", "𢱢 1\n", "鍟 1\n", "煱 1\n", "撘 1\n", "閱 1\n", "橇 1\n", "籽 1\n", "庵 1\n", "厨 1\n", "疴 1\n", "豹 1\n", "杠 1\n", "咘 1\n", "裡 1\n", "熏 1\n", " 1\n" ] } ], "source": [ "from collections import defaultdict\n", "\n", "rare_can_charset = defaultdict(int)\n", "for line in can_lines:\n", " for c in line:\n", " if not c in common_can_charset:\n", " rare_can_charset[c] += 1\n", "\n", "print(f\"Found {len(rare_can_charset)} rare Cantonese characters\")\n", "\n", "charset_sort_by_freq = dict(sorted(rare_can_charset.items(), key=lambda item: -item[1]))\n", "for 
c, freq in charset_sort_by_freq.items():\n", " print(c, freq)" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Found 12360 normalized mappings\n" ] } ], "source": [ "char_to_normalized_char = {}\n", "\n", "with open(\"zh_char2str_mapping.txt\", \"r\") as input_file:\n", " for line in input_file.read().splitlines():\n", " [c, n] = line.split(\"\\t\")\n", " char_to_normalized_char[c] = n\n", "\n", "print(\"Found {} normalized mappings\".format(len(char_to_normalized_char)))" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "𠹺\t埋\t388\n", "噖\t琴\t162\n", "𡁵\t緊\t157\n", "𠶧\t掂\t88\n", "嚫\t親\t88\n", "屘\t尾\t57\n", "衭\t衤夫\t47\n", "贃\t賺\t43\n", "說\t???\t35\n", "𧵳\t???\t30\n", "歳\t歲\t27\n", "𢫏\t全\t27\n", "𨶙\t能\t25\n", "癐\t???\t25\n", "𦡆\t???\t25\n", "𨃩\t⻊扇\t24\n", "况\t???\t21\n", "内\t內\t19\n", "𢵌\t扌隊\t19\n", "𦧺\t賴\t18\n", "𠹌\t o能\t18\n", "爲\t為\t16\n", "𢱑\t抓\t16\n", "𡁯\t???\t15\n", "𠱓\t詭\t14\n", "𠵿\t披\t14\n", "踹\t???\t13\n", "㗇\t???\t13\n", "𠾴\t棒\t13\n", "嗍\t索\t13\n", "𧘹\t太\t13\n", "𠹳\t傑\t12\n", "𠹭\t???\t12\n", "脫\t???\t12\n", "䁪\t???\t11\n", "𧨾\t氹\t11\n", "掬\t???\t11\n", "𠸐\t???\t11\n", "啥\t???\t11\n", "𠱃\t o凹\t10\n", "噔\t o登\t10\n", "捹\t扌奔\t10\n", "𠹻\t???\t10\n", "𠼻\t基\t10\n", "噠\t???\t10\n", "𨳊\t九\t10\n", "𢲲\t???\t9\n", "𨉖\t???\t9\n", "躭\t耽\t9\n", "䠋\t卑\t9\n", "嘮\t???\t9\n", "啽\t o弇\t9\n", "滮\t氵彪\t8\n", "㧻\t扌涿\t8\n", "𧶄\t???\t8\n", "𦛚\t???\t8\n", "撠\t扌戟\t8\n", "呡\t o吻\t8\n", "睸\t目眉\t8\n", "𠰲\t???\t8\n", "𥔿\t???\t8\n", "唎\t脷\t8\n", "𠸊\t???\t8\n", "𬜐\t???\t8\n", "蔥\t葱\t8\n", "呱\t???\t8\n", "B\t???\t7\n", "𢯊\t扌的\t7\n", "𫫃\t???\t7\n", "𢝵\t???\t7\n", "銹\t鏽\t7\n", "㓤\t吉刂\t7\n", "䁯\t???\t7\n", "啉\t o林\t7\n", "臥\t???\t7\n", "𠓼\t???\t7\n", "稅\t???\t7\n", "\t???\t7\n", "喴\t o威\t7\n", "噱\t???\t7\n", "衛\t???\t6\n", "𡄯\t???\t6\n", "揤\t扌即\t6\n", "𢤹\t???\t6\n", "\t???\t6\n", "鷄\t雞\t6\n", "湴\t氵並\t6\n", "\t???\t6\n", 
"𦣇\t???\t6\n", "齧\t咬\t6\n", "𠮨\t乃\t6\n", "\t???\t6\n", "𡀝\t???\t6\n", "婄\t蓓\t6\n", "𠼱\t累\t6\n", "𠱂\t???\t5\n", "磧\t石責\t5\n", "𠰋\t???\t5\n", "𡂖\t???\t5\n", "浭\t氵更\t5\n", "擏\t擎\t5\n", "𥋇\t掌\t5\n", "揢\t扌客\t5\n", "㨆\t扌林\t5\n", "𠾍\t棄\t5\n", "兌\t???\t5\n", "𢺳\t???\t5\n", "坺\t土拔\t5\n", "鍚\t???\t5\n", "𣘚\t???\t5\n", "𪘁\t???\t5\n", "𨳍\t七\t5\n", "嗙\t o旁\t5\n", "𠼰\t???\t5\n", "𨳒\t小\t4\n", "唿\t篋\t4\n", "𣳼\t???\t4\n", "𦂥\t???\t4\n", "溚\t塔\t4\n", "囋\t???\t4\n", "瀄\t吱\t4\n", "𠌥\t???\t4\n", "𢫦\t???\t4\n", "𢶍\t???\t4\n", "𠲵\t???\t4\n", "䉺\t米\t4\n", "炕\t???\t4\n", "𢴈\t撻\t4\n", "𡲢\t???\t4\n", "𥅈\t立\t4\n", "𬧊\t???\t4\n", "簕\t勒\t4\n", "査\t查\t4\n", "𩜠\t岩\t4\n", "𫬿\t???\t4\n", "𠜱\t卑刂\t4\n", "嚬\t顰\t4\n", "𠹹\t???\t4\n", "𦉘\t???\t4\n", "唦\t o沙\t4\n", "㨘\t扌省\t4\n", "𡄽\t瀉\t4\n", "熗\t槍\t4\n", "𡁷\t???\t4\n", "𠿬\t???\t4\n", "咜\t叱\t4\n", "𠸏\t茄\t4\n", "𡁸\t???\t4\n", "𡃵\t???\t4\n", "𪚩\t???\t4\n", "D\t???\t4\n", "Q\t???\t4\n", "𨆯\t???\t3\n", "啗\t啖\t3\n", "蔸\t艹兜\t3\n", "舗\t鋪\t3\n", "囪\t窗\t3\n", "艔\t???\t3\n", "洩\t???\t3\n", "𢵧\t???\t3\n", "菓\t果\t3\n", "䪴\t???\t3\n", "䆲\t???\t3\n", "痱\t???\t3\n", "趿\t拖\t3\n", "𠮩\t???\t3\n", "搉\t確\t3\n", "矋\t矖\t3\n", "𠻗\t???\t3\n", "𢲈\t???\t3\n", "潞\t氵路\t3\n", "沬\t???\t3\n", "揇\t扌南\t3\n", "齃\t曷\t3\n", "𡃤\t賴\t3\n", "𡃶\t???\t3\n", "瀟\t???\t3\n", "軨\t???\t3\n", "鉻\t???\t3\n", "\t???\t3\n", "㿭\t斥\t3\n", "𢵄\t???\t3\n", "㗲\t???\t3\n", "𢫕\t???\t3\n", "𢰸\t???\t3\n", "葫\t???\t3\n", "咔\t???\t3\n", "嚎\t???\t3\n", "嗿\t???\t3\n", "咈\t o弗\t3\n", "咾\t嚕\t3\n", "\t???\t3\n", "𠵈\t妹\t3\n", "吥\t o不\t3\n", "𠾭\t???\t3\n", "𠾵\t???\t3\n", "朘\t俊\t3\n", "觥\t黃\t3\n", "㩧\t扌暴\t2\n", "焙\t???\t2\n", "兀\t???\t2\n", "䭤\t???\t2\n", "饊\t???\t2\n", "[\t???\t2\n", "]\t???\t2\n", "炖\t???\t2\n", "争\t爭\t2\n", "䁓\t???\t2\n", "𡂝\t???\t2\n", "𩬎\t壬\t2\n", "鈒\t閘\t2\n", "亁\t乾\t2\n", "炠\t灬甲\t2\n", "摼\t???\t2\n", "𠺬\t???\t2\n", "𠵉\t???\t2\n", "蝄\t???\t2\n", "\t???\t2\n", "蔫\t艹焉\t2\n", "㘉\t???\t2\n", "荏\t???\t2\n", "墘\t土乾\t2\n", "嗏\t搽\t2\n", "呣\t o母\t2\n", "曚\t矇\t2\n", "壬\t???\t2\n", "揅\t研\t2\n", "溼\t濕\t2\n", 
"囓\t咬\t2\n", "嚙\t咬\t2\n", "枴\t拐\t2\n", "𡃀\t???\t2\n", "饑\t???\t2\n", "䏭\t???\t2\n", "挼\t挪\t2\n", "掱\t???\t2\n", "咑\t打\t2\n", "芙\t???\t2\n", "𦂗\t???\t2\n", "舦\t軚\t2\n", "𢶤\t扌靴\t2\n", "翡\t???\t2\n", "翠\t???\t2\n", "酡\t酉它\t2\n", "𫭊\t???\t2\n", "煀\t火屈\t2\n", "耙\t???\t2\n", "𠿭\t滑\t2\n", "鉤\t鈎\t2\n", "𠻘\t???\t2\n", "脽\t離\t2\n", "焊\t???\t2\n", "唊\t o夾\t2\n", "胅\t⺼失\t2\n", "翕\t???\t2\n", "摜\t摔\t2\n", "僚\t???\t1\n", "𩗴\t???\t1\n", "毡\t???\t1\n", "跤\t???\t1\n", "梧\t???\t1\n", "痄\t疒乍\t1\n", "卟\t卜\t1\n", "劄\t札\t1\n", "𠶜\t制\t1\n", "睜\t???\t1\n", "迹\t跡\t1\n", "揃\t扌前\t1\n", "唨\t o阻\t1\n", "謢\t護\t1\n", "菻\t麻\t1\n", "𣚺\t???\t1\n", "鷓\t庶鳥\t1\n", "鴣\t古鳥\t1\n", "强\t???\t1\n", "𠾶\t???\t1\n", "𡆀\t轟\t1\n", "拫\t扌艮\t1\n", "𠼮\t偽\t1\n", "汞\t???\t1\n", "㤿\t???\t1\n", "厴\t???\t1\n", "𥀬\t???\t1\n", "牯\t???\t1\n", "𡇙\t???\t1\n", "讕\t賴\t1\n", "𠿫\t???\t1\n", "瘺\t婁\t1\n", "骲\t骨包\t1\n", "𫲭\t???\t1\n", "瓏\t玉龍\t1\n", "繚\t???\t1\n", "撿\t???\t1\n", "跀\t⻊月\t1\n", "𢛴\t掹\t1\n", "蝻\t虫南\t1\n", "赧\t羞赤\t1\n", "𪙛\t甩\t1\n", "\t???\t1\n", "檳\t???\t1\n", "潲\t餿\t1\n", "𢶠\t???\t1\n", "秧\t???\t1\n", "蒔\t???\t1\n", "炩\t灬令\t1\n", "㩋\t???\t1\n", "饅\t???\t1\n", "鍍\t???\t1\n", "𢚖\t???\t1\n", "𧊅\t虫另\t1\n", "\t???\t1\n", "篸\t???\t1\n", "𩟔\t???\t1\n", "撍\t賺\t1\n", "栗\t???\t1\n", "\t???\t1\n", "𡆇\t???\t1\n", "杧\t芒\t1\n", "榛\t???\t1\n", "蠄\t虫禽\t1\n", "蟧\t???\t1\n", "嘶\t???\t1\n", "梆\t???\t1\n", "竪\t豎\t1\n", "騾\t???\t1\n", "矺\t???\t1\n", "堀\t???\t1\n", "麝\t???\t1\n", "慪\t嘔\t1\n", "撴\t扌敦\t1\n", "哾\t啜\t1\n", "𠳖\t???\t1\n", "洌\t冽\t1\n", "霹\t???\t1\n", "𠾼\t???\t1\n", "𬦠\t???\t1\n", "𤌍\t???\t1\n", "𬧯\t???\t1\n", "厠\t廁\t1\n", "㖡\t???\t1\n", "跁\t⻊巴\t1\n", "鉎\t???\t1\n", "𧣈\t???\t1\n", "𠳏\t???\t1\n", "㹃\t非\t1\n", "𧝞\t???\t1\n", "𡀞\t???\t1\n", "㦒\t???\t1\n", "𩩍\t娉\t1\n", "𢱢\t???\t1\n", "鍟\t???\t1\n", "煱\t???\t1\n", "撘\t搭\t1\n", "閱\t???\t1\n", "橇\t喬\t1\n", "籽\t???\t1\n", "庵\t???\t1\n", "厨\t???\t1\n", "疴\t屙\t1\n", "豹\t???\t1\n", "杠\t槓\t1\n", "咘\t o布\t1\n", "裡\t???\t1\n", "熏\t燻\t1\n", "\t???\t1\n" ] } ], "source": [ "for c, freq in 
charset_sort_by_freq.items():\n",
 "    # \"???\" marks characters that have no entry in the normalization table.\n",
 "    normalized = char_to_normalized_char.get(c, \"???\")\n",
 "    print(c + \"\\t\" + normalized + \"\\t\" + str(freq))"
 ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Loaded 177 normalization mappings\n", "Sample of first 10 highest frequency mappings:\n", "[('𠹺', '埋'), ('噖', '琴'), ('𡁵', '緊'), ('𠶧', '掂'), ('嚫', '親'), ('屘', '尾'), ('衭', '褲'), ('贃', '賺'), ('說', '説'), ('𧵳', '蝕')]\n" ] } ], "source": [
 "abc_mapping = {}\n",
 "\n",
 "# Explicit UTF-8: without it open() falls back to the platform's locale\n",
 "# encoding, which is not guaranteed to decode Chinese text.\n",
 "# NOTE(review): assumes abc_rare_char_mapping.txt is UTF-8 - confirm.\n",
 "with open(\"abc_rare_char_mapping.txt\", \"r\", encoding=\"utf-8\") as input_file:\n",
 "    for line in input_file.read().splitlines():\n",
 "        # Each row has three tab-separated fields: character, replacement, frequency.\n",
 "        c, n, _freq = line.split(\"\\t\")\n",
 "        # Only single-character replacements are usable; rows whose replacement\n",
 "        # is longer (presumably unresolved placeholders - verify against the file)\n",
 "        # are left out of the mapping.\n",
 "        if len(n) == 1:\n",
 "            abc_mapping[c] = n\n",
 "\n",
 "print(\"Loaded {} normalization mappings\".format(len(abc_mapping)))\n",
 "print(\"Sample of first 10 highest frequency mappings:\")\n",
 "print(list(abc_mapping.items())[:10])"
 ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [], "source": [
 "def normalize_abc(line: str) -> str:\n",
 "    \"\"\"Replace all occurrences of rare characters in `line` with normalized ones.\n",
 "\n",
 "    Every `abc_mapping` entry is applied in dictionary order with `str.replace`,\n",
 "    so a replacement that is itself a key of a later entry is rewritten again.\n",
 "    Afterwards the sequences \"而𠺢\" and \"依𠺢\" are rewritten to \"而家\" and \"依家\".\n",
 "    \"\"\"\n",
 "    for c, n in abc_mapping.items():\n",
 "        line = line.replace(c, n)\n",
 "    line = line.replace(\"而𠺢\", \"而家\").replace(\"依𠺢\", \"依家\")\n",
 "    return line"
 ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [], "source": [
 "# \"w\" is sufficient because the file is only written, never read back here;\n",
 "# UTF-8 is set explicitly so the output does not depend on the platform locale.\n",
 "with open(\"train/abc.can\", \"w\", encoding=\"utf-8\") as output_file:\n",
 "    for line in can_lines:\n",
 "        output_file.write(normalize_abc(line) + \"\\n\")\n"
 ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", " \n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", 
"pygments_lexer": "ipython3", "version": "3.10.6" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }