deepthulac-pos / cctokenizer.py
chengzl18's picture
Upload folder using huggingface_hub
34423c9
raw
history blame
24.1 kB
"""Tokenization classes for ChineseCharTokenizer."""
from typing import Optional, Tuple, Union
from transformers import BertTokenizer
import numpy as np
import os
import re
import shutil
# https://www.ling.upenn.edu/courses/Spring_2003/ling538/UnicodeRanges.html
# https://www.microfocus.com/documentation/idol/IDOL/Servers/IDOLServer/11.2/Guides/html/English/expert/Content/IDOLExpert/Languages/Script_Ranges.htm
# https://www.ssec.wisc.edu/~tomw/java/unicode.html
# https://character-table.netlify.app/
# https://character-table.netlify.app/french/
# https://www.compart.com/en/unicode/U+31CE 重要:看各个unicode的编码语义信息,属于哪个编码段(block/plane)
# 联合国六个官方语言:阿拉伯文、中文、英文、法文、俄文、西班牙文
# [U_LAT] 拉丁语系的文字,包括英语、法语、西班牙语、德语等
# [U_RUS] 俄语等
# [U_ARA] 表示阿拉伯语
# [U_JAP] 日语
# [U_KOR] 韩语
# [U_LAN] 某种语言的文字,藏语, 泰语等等
# [U_CHI] 未知中文
# [U_PHO] 注音文字,音标等
# [U_RAD] 部首,笔画
# [U_PUN] 标点
# [U_GRE] 希腊字母
# [U_SYM] 各种各样的符号
# [U_COM] 组合符号,包括上下标
# [U_NUM] 序号,包括VIII,(1),(一)等,也包含一些数字
# [U_MAT] 数学符号
# [U_EMO] 表情
# ¤ 货币符号
unicode_map = [
{'token': '[U_LAT]', 'range': (0x0000, 0x007F), 'meaning': 'Basic Latin', },
{'token': '[U_LAT]', 'range': (0x0080, 0x00FF), 'meaning': 'C1 Controls and Latin-1 Supplement', }, # 拉丁字符,有德语、法语
{'token': '[U_LAT]', 'range': (0x0100, 0x017F), 'meaning': 'Latin Extended-A', }, # 有法语,例如œ
{'token': '[U_LAT]', 'range': (0x0180, 0x024F), 'meaning': 'Latin Extended-B', }, # 有法语,例如Ÿ
{'token': '[U_PHO]', 'range': (0x0250, 0x02AF), 'meaning': 'IPA Extensions', }, # 国际音标,例如ə
{'token': '[U_PHO]', 'range': (0x02B0, 0x02FF), 'meaning': 'Spacing Modifier Letters', }, # 国际音标,例如ʳ
{'token': '[U_PHO]', 'range': (0x0300, 0x036F), 'meaning': 'Combining Diacritical Marks', }, # 变音符号
{'token': '[U_GRE]', 'range': (0x0370, 0x03FF), 'meaning': 'Greek/Coptic', }, # 希腊字符,例如α
{'token': '[U_RUS]', 'range': (0x0400, 0x04FF), 'meaning': 'Cyrillic', }, # 有俄语
{'token': '[U_RUS]', 'range': (0x0500, 0x052F), 'meaning': 'Cyrillic Supplement', },
{'token': '[U_LAN]', 'range': (0x0530, 0x058F), 'meaning': 'Armenian', }, # 亚美尼亚语,属于印欧语系
{'token': '[U_LAN]', 'range': (0x0590, 0x05FF), 'meaning': 'Hebrew', }, # 希伯来语,犹太族用,有以色列,属于亚非语系
{'token': '[U_ARA]', 'range': (0x0600, 0x06FF), 'meaning': 'Arabic', }, # 阿拉伯语,联合国用,有沙特,属于亚非语系
{'token': '[U_LAN]', 'range': (0x0700, 0x074F), 'meaning': 'Syriac', }, # 古叙利亚语
{'token': '[U_ARA]', 'range': (0x0750, 0x077F), 'meaning': 'Undefined -> Arabic', },
{'token': '[U_LAN]', 'range': (0x0780, 0x07BF), 'meaning': 'Thaana', }, # 它拿字母, 马尔代夫用,属于印欧语系
{'token': '[U_ARA]', 'range': (0x07C0, 0x08FF), 'meaning': 'Undefined -> Arabic', },
{'token': '[U_LAN]', 'range': (0x0900, 0x097F), 'meaning': 'Devanagari', }, # 梵语, 印度宗教用,属于印欧语系
{'token': '[U_LAN]', 'range': (0x0980, 0x09FF), 'meaning': 'Bengali/Assamese', }, # 孟加拉语
{'token': '[U_LAN]', 'range': (0x0A00, 0x0A7F), 'meaning': 'Gurmukhi', }, # 古木基文,彭加语,印度用
{'token': '[U_LAN]', 'range': (0x0A80, 0x0AFF), 'meaning': 'Gujarati', }, # 古吉拉特语,印度用
{'token': '[U_LAN]', 'range': (0x0B00, 0x0B7F), 'meaning': 'Oriya', }, # 印度用
{'token': '[U_LAN]', 'range': (0x0B80, 0x0BFF), 'meaning': 'Tamil', }, # 印度用
{'token': '[U_LAN]', 'range': (0x0C00, 0x0C7F), 'meaning': 'Telugu', }, # 印度用
{'token': '[U_LAN]', 'range': (0x0C80, 0x0CFF), 'meaning': 'Kannada', }, # 印度用
{'token': '[U_LAN]', 'range': (0x0D00, 0x0DFF), 'meaning': 'Malayalam', }, # 印度用
# {'token': '[U_LAN]', 'range': (0x0D80, 0x0DFF), 'meaning': 'Sinhala', }, # 僧伽罗语,斯里兰卡用,近印度
{'token': '[U_LAN]', 'range': (0x0E00, 0x0E7F), 'meaning': 'Thai', }, # 泰语
{'token': '[U_LAN]', 'range': (0x0E80, 0x0EFF), 'meaning': 'Lao', }, # 老挝语
{'token': '[U_LAN]', 'range': (0x0F00, 0x0FFF), 'meaning': 'Tibetan', }, # 藏语 NOTE: 藏语是否需要单独列出?
{'token': '[U_LAN]', 'range': (0x1000, 0x109F), 'meaning': 'Myanmar', }, # 缅甸语
{'token': '[U_LAN]', 'range': (0x10A0, 0x10FF), 'meaning': 'Georgian', }, # 格鲁吉亚语,伊朗也用
{'token': '[U_KOR]', 'range': (0x1100, 0x11FF), 'meaning': 'Hangul Jamo', }, # 谚文,有古韩语
{'token': '[U_LAN]', 'range': (0x1200, 0x137F), 'meaning': 'Ethiopic', }, # 埃塞俄比亚语,非洲,仅次于阿拉伯语
{'token': '[U_LAN]', 'range': (0x1380, 0x139F), 'meaning': 'Undefined -> Ethiopic', },
{'token': '[U_LAN]', 'range': (0x13A0, 0x13FF), 'meaning': 'Cherokee', }, # 切罗基语,北美原住民
{'token': '[U_LAN]', 'range': (0x1400, 0x167F), 'meaning': 'Unified Canadian Aboriginal Syllabics', }, # 加拿大澳大利亚音标
{'token': '[U_LAN]', 'range': (0x1680, 0x169F), 'meaning': 'Ogham', }, # 欧甘字母,古爱尔兰用
{'token': '[U_LAN]', 'range': (0x16A0, 0x16FF), 'meaning': 'Runic', }, # 卢恩字母,古北欧用
{'token': '[U_LAN]', 'range': (0x1700, 0x171F), 'meaning': 'Tagalog', }, # 他加禄语,菲律宾及东南亚用
{'token': '[U_LAN]', 'range': (0x1720, 0x173F), 'meaning': 'Hanunoo', }, # 菲律宾用
{'token': '[U_LAN]', 'range': (0x1740, 0x175F), 'meaning': 'Buhid', }, # 菲律宾用
{'token': '[U_LAN]', 'range': (0x1760, 0x177F), 'meaning': 'Tagbanwa', }, # 菲律宾用
{'token': '[U_LAN]', 'range': (0x1780, 0x17FF), 'meaning': 'Khmer', }, # 高棉语,柬埔寨用
{'token': '[U_LAN]', 'range': (0x1800, 0x18AF), 'meaning': 'Mongolian', }, # 蒙古语
{'token': '[U_LAN]', 'range': (0x18B0, 0x18FF), 'meaning': 'Undefined -> Unified Canadian Aboriginal Syllabics', },
{'token': '[U_LAN]', 'range': (0x1900, 0x194F), 'meaning': 'Limbu', }, # 林布语,尼泊尔用
{'token': '[U_LAN]', 'range': (0x1950, 0x197F), 'meaning': 'Tai Le', }, # 傣语,云南用
{'token': '[U_LAN]', 'range': (0x1980, 0x19DF), 'meaning': 'Undefined -> New Tai Lue', }, # 新傣语
{'token': '[U_LAN]', 'range': (0x19E0, 0x19FF), 'meaning': 'Khmer Symbols', }, # 高棉标点,柬埔寨用
{'token': '[U_LAN]', 'range': (0x1A00, 0x1CFF), 'meaning': 'Undefined -> Ol Chiki', }, # 桑塔利语,印度用
{'token': '[U_PHO]', 'range': (0x1D00, 0x1D7F), 'meaning': 'Phonetic Extensions', }, # 音标,例如法语的ᵈ
{'token': '[U_PHO]', 'range': (0x1D80, 0x1DFF), 'meaning': 'Undefined -> Phonetic Extensions Supplement', },
{'token': '[U_LAT]', 'range': (0x1E00, 0x1EFF), 'meaning': 'Latin Extended Additional', }, # 拉丁带修饰符号,例如ṡ
{'token': '[U_GRE]', 'range': (0x1F00, 0x1FFF), 'meaning': 'Greek Extended', }, # 希腊字符带修饰,例如Ᾱ
{'token': '[U_SYM]', 'range': (0x2000, 0x206F), 'meaning': 'General Punctuation', }, # 各种符号,例如千分之‰
{'token': '[U_COM]', 'range': (0x2070, 0x209F), 'meaning': 'Superscripts and Subscripts', }, # 上下标,例如₂
{'token': '¤', 'range': (0x20A0, 0x20CF), 'meaning': 'Currency Symbols', }, # 货币符号,例如欧元€
{'token': '[U_COM]', 'range': (0x20D0, 0x20FF), 'meaning': 'Combining Diacritical Marks for Symbols', }, # 奇怪的可组合符号
{'token': '[U_SYM]', 'range': (0x2100, 0x214F), 'meaning': 'Letterlike Symbols', }, # 符号,例如普朗克常数ℎ,摄氏度℃
{'token': '[U_NUM]', 'range': (0x2150, 0x218F), 'meaning': 'Number Forms', }, # 特殊形式数字,例如三分之一⅓,八Ⅷ
{'token': '[U_SYM]', 'range': (0x2190, 0x21FF), 'meaning': 'Arrows', }, # 箭头,例如→
{'token': '[U_MAT]', 'range': (0x2200, 0x22FF), 'meaning': 'Mathematical Operators', }, # 数学符号,例如减号−,不属于∉
{'token': '[U_SYM]', 'range': (0x2300, 0x23FF), 'meaning': 'Miscellaneous Technical', }, # 杂乱的符号,例如放大倍数⌀,右上角⌝
{'token': '[U_SYM]', 'range': (0x2400, 0x243F), 'meaning': 'Control Pictures', }, # 描述符:表达控制的符号,例如空␀,退出␛
{'token': '[U_SYM]', 'range': (0x2440, 0x245F), 'meaning': 'Optical Character Recognition', }, # OCR符号,例如⑀,⑂
{'token': '[U_NUM]', 'range': (0x2460, 0x24FF), 'meaning': 'Enclosed Alphanumerics', }, # 带框序号,例如①,⒆
{'token': '[U_SYM]', 'range': (0x2500, 0x257F), 'meaning': 'Box Drawing', }, # 画盒子符,例如┋,┓
{'token': '[U_SYM]', 'range': (0x2580, 0x259F), 'meaning': 'Block Elements', }, # 画块符,例如▉,▀
{'token': '[U_SYM]', 'range': (0x25A0, 0x25FF), 'meaning': 'Geometric Shapes', }, # 几何形状,例如△
{'token': '[U_SYM]', 'range': (0x2600, 0x26FF), 'meaning': 'Miscellaneous Symbols', }, # 杂乱的符号,例如多云☁,女♀
{'token': '[U_SYM]', 'range': (0x2700, 0x27BF), 'meaning': 'Dingbats', }, # 杂乱的符号✈➓
{'token': '[U_MAT]', 'range': (0x27C0, 0x27EF), 'meaning': 'Miscellaneous Mathematical Symbols-A', }, # 杂乱的数学符号⟂⟘
{'token': '[U_SYM]', 'range': (0x27F0, 0x27FF), 'meaning': 'Supplemental Arrows-A', }, # 补充箭头⟹
{'token': '[U_LAN]', 'range': (0x2800, 0x28FF), 'meaning': 'Braille Patterns', }, # 盲文,点字文,⠝⠟
{'token': '[U_SYM]', 'range': (0x2900, 0x297F), 'meaning': 'Supplemental Arrows-B', }, # ⥬
{'token': '[U_MAT]', 'range': (0x2980, 0x29FF), 'meaning': 'Miscellaneous Mathematical Symbols-B', }, # ⭬
{'token': '[U_MAT]', 'range': (0x2A00, 0x2AFF), 'meaning': 'Supplemental Mathematical Operators', }, # ⪆
{'token': '[U_SYM]', 'range': (0x2B00, 0x2BFF), 'meaning': 'Miscellaneous Symbols and Arrows'}, # ⭬
{'token': '[U_LAN]', 'range': (0x2C00, 0x2E7F), 'meaning': 'Undefined -> Coptic', }, # 科普特语,埃塞俄比亚语
{'token': '[U_RAD]', 'range': (0x2E80, 0x2EFF), 'meaning': 'CJK Radicals Supplement', }, # CJK中日韩统一表意文字,部首,例如⺘
{'token': '[U_RAD]', 'range': (0x2F00, 0x2FDF), 'meaning': 'Kangxi Radicals', }, # 康熙字典部首
{'token': '[U_SYM]', 'range': (0x2FE0, 0x2FEF), 'meaning': 'Undefined -> Symbol', },
{'token': '[U_SYM]', 'range': (0x2FF0, 0x2FFF), 'meaning': 'Ideographic Description Characters', }, # 描述符:表意文字结构,例如上下结构,左右结构,半包围结构
{'token': '[U_PUN]', 'range': (0x3000, 0x303F), 'meaning': 'CJK Symbols and Punctuation', }, # 中文标点 。
{'token': '[U_JAP]', 'range': (0x3040, 0x309F), 'meaning': 'Hiragana', }, # 日语平假名
{'token': '[U_JAP]', 'range': (0x30A0, 0x30FF), 'meaning': 'Katakana', }, # 日语片假名
{'token': '[U_PHO]', 'range': (0x3100, 0x312F), 'meaning': 'Bopomofo', }, # 汉语拼音字,例如 ㄠㄎ
{'token': '[U_KOR]', 'range': (0x3130, 0x318F), 'meaning': 'Hangul Compatibility Jamo', }, # 韩文
{'token': '[U_JAP]', 'range': (0x3190, 0x319F), 'meaning': 'Kanbun (Kunten)', }, # 汉文,日本用
{'token': '[U_PHO]', 'range': (0x31A0, 0x31BF), 'meaning': 'Bopomofo Extended', }, # 汉语拼音字,例如ㆠ
{'token': '[U_RAD]', 'range': (0x31C0, 0x31EF), 'meaning': 'Undefined -> CJK Strokes', }, # 汉字笔画
{'token': '[U_JAP]', 'range': (0x31F0, 0x31FF), 'meaning': 'Katakana Phonetic Extensions', }, # 片假名音标
{'token': '[U_NUM]', 'range': (0x3200, 0x32FF), 'meaning': 'Enclosed CJK Letters and Months', }, # 汉字序号,例如㈠
{'token': '[U_SYM]', 'range': (0x3300, 0x33FF), 'meaning': 'CJK Compatibility', }, # 单字符表达单位 平方厘米㎠,毫克㎎,23点㍯
{'token': '[U_CHI]', 'range': (0x3400, 0x4DBF), 'meaning': 'CJK Unified Ideographs Extension A', }, # 中文罕见字
{'token': '[U_SYM]', 'range': (0x4DC0, 0x4DFF), 'meaning': 'Yijing Hexagram Symbols', }, # 易经六十四卦,䷁ ䷖
{'token': '[U_CHI]', 'range': (0x4E00, 0x9FAF), 'meaning': 'CJK Unified Ideographs', }, # 中文
{'token': '[U_CHI]', 'range': (0x9FB0, 0x9FFF), 'meaning': 'Undefined -> CJK Unified Ideographs', }, # 中文
{'token': '[U_LAN]', 'range': (0xA000, 0xA48F), 'meaning': 'Yi Syllables', }, # 彝文字符,凉山彝族用
{'token': '[U_LAN]', 'range': (0xA490, 0xA4CF), 'meaning': 'Yi Radicals', }, # 彝文部首
{'token': '[U_LAN]', 'range': (0xA4D0, 0xABFF), 'meaning': 'Undefined -> Cherokee'}, # 彻罗基族, 北美印第安人
{'token': '[U_KOR]', 'range': (0xAC00, 0xD7AF), 'meaning': 'Hangul Syllables', }, # 韩语音节
{'token': '[U_KOR]', 'range': (0xD7B0, 0xD7FF), 'meaning': 'Undefined -> Hangul Jamo Extended-B', },
{'range': (0xD800, 0xDBFF), 'meaning': 'High Surrogate Area', 'token': '[UNK]'},
{'range': (0xDC00, 0xDFFF), 'meaning': 'Low Surrogate Area', 'token': '[UNK]'},
{'range': (0xE000, 0xF8FF), 'meaning': 'Private Use Area', 'token': '[UNK]'}, # NOTE 统计字频时,有很多落无法显示的字符落到了这一区域
{'token': '[U_CHI]', 'range': (0xF900, 0xFAFF), 'meaning': 'CJK Compatibility Ideographs', }, # 中文
{'token': '[U_LAT]', 'range': (0xFB00, 0xFB4F), 'meaning': 'Alphabetic Presentation Forms', }, # 拉丁、希伯来语字母排版符号
{'token': '[U_ARA]', 'range': (0xFB50, 0xFDFF), 'meaning': 'Arabic Presentation Forms-A', }, # 阿拉伯语排版符号
{'token': '[U_SYM]', 'range': (0xFE00, 0xFE0F), 'meaning': 'Variation Selectors', }, # 描述符: 表示选择第几个
{'token': '[U_PUN]', 'range': (0xFE10, 0xFE1F), 'meaning': 'Undefined -> Vertical Forms', }, # 竖排标点
{'token': '[U_COM]', 'range': (0xFE20, 0xFE2F), 'meaning': 'Combining Half Marks', }, # 可组合符号 例如$︡a
{'token': '[U_PUN]', 'range': (0xFE30, 0xFE4F), 'meaning': 'CJK Compatibility Forms', }, # 中文排版符号,例如竖排的符号︻︼
{'token': '[U_PUN]', 'range': (0xFE50, 0xFE6F), 'meaning': 'Small Form Variants', }, # 小符号,小逗号﹐,小问号﹖
{'token': '[U_ARA]', 'range': (0xFE70, 0xFEFF), 'meaning': 'Arabic Presentation Forms-B', }, # 阿拉伯语排版符号
{'token': '[U_LAT]', 'range': (0xFF00, 0xFFEF), 'meaning': 'Halfwidth and Fullwidth Forms', }, # 中文全角字符0c, TODO: 这里视为拉丁文,应该映射回拉丁的半角字符
{'token': '[U_SYM]', 'range': (0xFFF0, 0xFFFF), 'meaning': 'Specials', }, # 描述符: 可表示替换等操作
{'token': '[U_LAN]', 'range': (0x10000, 0x1007F), 'meaning': 'Linear B Syllabary', }, # 线形文字,像象形文字
{'token': '[U_LAN]', 'range': (0x10080, 0x100FF), 'meaning': 'Linear B Ideograms', },
{'token': '[U_LAN]', 'range': (0x10100, 0x1013F), 'meaning': 'Aegean Numbers', }, # 爱琴海象形数字
{'token': '[U_LAN]', 'range': (0x10140, 0x102FF), 'meaning': 'Undefined -> Carian', },
{'token': '[U_LAN]', 'range': (0x10300, 0x1032F), 'meaning': 'Old Italic', }, # 古意大利
{'token': '[U_LAN]', 'range': (0x10330, 0x1034F), 'meaning': 'Gothic', }, # 哥特语
{'range': (0x10350, 0x1037F), 'meaning': 'Undefined', 'token': '[UNK]'},
{'token': '[U_LAN]', 'range': (0x10380, 0x1039F), 'meaning': 'Ugaritic', }, # 古文字在叙利亚发现
{'token': '[U_LAN]', 'range': (0x103A0, 0x103FF), 'meaning': 'Undefined -> Old Persian', }, # 古波斯语
{'token': '[U_LAN]', 'range': (0x10400, 0x1044F), 'meaning': 'Deseret', }, # 北美原住民用
{'token': '[U_PHO]', 'range': (0x10450, 0x1047F), 'meaning': 'Shavian', }, # 萧伯纳的音标
{'token': '[U_LAN]', 'range': (0x10480, 0x104AF), 'meaning': 'Osmanya', }, # 索马里曾经用
{'token': '[U_LAN]', 'range': (0x104B0, 0x107FF), 'meaning': 'Undefined -> Osage', }, # 欧塞奇语
{'token': '[U_LAN]', 'range': (0x10800, 0x1083F), 'meaning': 'Cypriot Syllabary', }, # 塞浦路斯,地中海岛国
{'token': '[U_LAN]', 'range': (0x10840, 0x1CFFF), 'meaning': 'Undefined -> Cuneiform, Chakma, Kharoshthi...', }, # 许多种语言,包括楔形文字, 和一些符号𓆝 1319d 𓆟 1319f
{'token': '[U_LAN]', 'range': (0x1D000, 0x1D0FF), 'meaning': 'Byzantine Musical Symbols', }, # 拜占庭人音乐符号
{'token': '[U_SYM]', 'range': (0x1D100, 0x1D1FF), 'meaning': 'Musical Symbols', }, # 音乐符号
{'range': (0x1D200, 0x1D2FF), 'meaning': 'Undefined', 'token': '[UNK]'},
{'token': '[U_SYM]', 'range': (0x1D300, 0x1D35F), 'meaning': 'Tai Xuan Jing Symbols', }, # 太玄经符号
{'range': (0x1D360, 0x1D3FF), 'meaning': 'Undefined', 'token': '[UNK]'},
{'token': '[U_MAT]', 'range': (0x1D400, 0x1D7FF), 'meaning': 'Mathematical Alphanumeric Symbols', }, # 数学中用的字母,例如斜体的字母
{'token': '[U_LAN]', 'range': (0x1D800, 0x1F003), 'meaning': 'Undefined -> Adlam', }, # 阿德拉姆字母,用于西非
{'token': '[U_EMO]', 'range': (0x1F004, 0x1FAF8), 'meaning': 'Undefined -> Emoji', }, # emoji
{'token': '[U_SYM]', 'range': (0x1FAF9, 0x1FFFF), 'meaning': 'Undefined -> Symbols for Legacy Computing', },
{'token': '[U_CHI]', 'range': (0x20000, 0x2A6DF), 'meaning': 'CJK Unified Ideographs Extension B', }, # 中文罕见字
{'token': '[U_CHI]', 'range': (0x2A6E0, 0x2F7FF), 'meaning': 'Undefined -> CJK Unified Ideographs Extension F...', }, # 中文罕见字
{'token': '[U_CHI]', 'range': (0x2F800, 0x2FA1F), 'meaning': 'CJK Compatibility Ideographs Supplement', }, # 中文罕见字
{'range': (0x2FA20, 0x2FAAF), 'meaning': 'Undefined', 'token': '[UNK]'},
{'range': (0x2FAB0, 0x2FFFF), 'meaning': 'Unused', 'token': '[UNK]'},
{'token': '[U_CHI]', 'range': (0x30000, 0x3134F), 'meaning': 'Unused -> CJK Unified Ideographs Extension G (unassigned)', }, # 未使用的中文编码区间
{'range': (0x31350, 0xDFFFF), 'meaning': 'Unused', 'token': '[UNK]'},
{'token': '[U_SYM]', 'range': (0xE0000, 0xE007F), 'meaning': 'Tags', }, # 描述符: 表示标签
{'range': (0xE0080, 0xE00FF), 'meaning': 'Unused', 'token': '[UNK]'},
{'token': '[U_SYM]', 'range': (0xE0100, 0xE01EF), 'meaning': 'Variation Selectors Supplement', }, # 描述符: 表示选择
{'range': (0xE01F0, 0xEFFFF), 'meaning': 'Unused', 'token': '[UNK]'},
{'range': (0xF0000, 0xFFFFD), 'meaning': 'Supplementary Private Use Area-A', 'token': '[UNK]'},
{'range': (0xFFFFE, 0xFFFFF), 'meaning': 'Unused', 'token': '[UNK]'},
{'range': (0x100000, 0x10FFFD), 'meaning': 'Supplementary Private Use Area-B', 'token': '[UNK]'},
]
def get_unicode_ranges():
# 检查区间是否连续
left_bounds = [m['range'][0] for m in unicode_map]
right_bounds = [m['range'][1] for m in unicode_map]
for right, left in zip(right_bounds[:-1], left_bounds[1:]):
assert right+1 == left
return np.array(right_bounds)
def _is_chinese_char(cp):
# copied from transformers.models.bert.tokenization_bert.BasicTokenizer._is_chinese_char
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like the all of the other languages.
if (
(cp >= 0x4E00 and cp <= 0x9FFF)
or (cp >= 0x3400 and cp <= 0x4DBF) #
or (cp >= 0x20000 and cp <= 0x2A6DF) #
or (cp >= 0x2A700 and cp <= 0x2B73F) #
or (cp >= 0x2B740 and cp <= 0x2B81F) #
or (cp >= 0x2B820 and cp <= 0x2CEAF) #
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F) #
): #
return True
return False
def show_unicode(start=0x1F004, end=0x1FAF8):
# emoji 文件 https://www.unicode.org/Public/emoji/latest/emoji-sequences.txt
# https://github.com/hidehalo/emoji/issues/3
# https://apps.timwhitlock.info/emoji/tables/unicode
for i in range(start, end):
print(chr(i), end=' ')
print()
def load_json(file):
import json
with open(file, 'r', encoding='utf-8') as f:
obj = json.load(f)
return obj
class ChineseCharTokenizer(BertTokenizer):
vocab_files_names = {"vocab_file": "vocab.txt", 'mapping_file': "replace.json"}
def __init__(self, vocab_file, *args, **kwargs):
super(ChineseCharTokenizer, self).__init__(vocab_file, *args, **kwargs)
self.unicoder_ranges = get_unicode_ranges()
self.enclosed_tokens = {token for token in self.vocab if token[0] == '[' and token[-1] == ']' and 'unused' not in token}
self.enclosed_tokens_by_len = [
[token for token in self.enclosed_tokens if len(token) == 5],
[token for token in self.enclosed_tokens if len(token) == 6],
[token for token in self.enclosed_tokens if len(token) == 7]
]
self.dir = os.path.join(os.path.dirname(vocab_file))
self.replace_map = load_json(os.path.join(self.dir, 'replace.json'))
# # [EOS]相当于逗号、换行,不用看作special token
def convert_token_to_representative(self, token: str) -> str:
token = self.replace_map.get(token, token) # 异体字转换,繁简转换,全半角转换,大小写转换等
if token in self.vocab:
return token
else:
assert len(token) == 1, token
if re.match(r'\s', token): # 匹配\u2003, \t等
return ' '
v = ord(token)
if _is_chinese_char(v):
return '[U_CHI]'
elif v <= 0x10FFFD:
i = np.searchsorted(self.unicoder_ranges, v) # 找到插入位置 ranges[i-1] < v <= ranges[i]
return unicode_map[i]['token']
else:
return '[UNK]'
# bert的tokenize会加上CLS?
def _tokenize(self, text):
# 如果没有人为加的特殊赋好,可以不用这个tokenize,list(text)就是tokenize的结果
split_tokens = []
i = 0
while i < len(text):
if text[i:i+5] in self.enclosed_tokens_by_len[0]:
split_tokens.append(text[i:i+5])
i += 5
elif text[i:i+6] == '[MASK]':
split_tokens.append('[MASK]')
i += 6
elif text[i:i+7] in self.enclosed_tokens_by_len[2]:
split_tokens.append(text[i:i+7])
i += 7
else:
split_tokens.append(self.convert_token_to_representative(text[i]))
i += 1
return split_tokens
def _convert_token_to_id(self, token):
return self.vocab.get(self.convert_token_to_representative(token), self.vocab.get(self.unk_token)) # BUG: convert_token_to_representative 不是 id!
def convert_tokens_to_string(self, tokens):
return ''.join(tokens)
def save_pretrained(self, save_directory: Union[str, os.PathLike], legacy_format: Optional[bool] = None, filename_prefix: Optional[str] = None, push_to_hub: bool = False, **kwargs) -> Tuple[str]:
ret = super().save_pretrained(save_directory, legacy_format, filename_prefix, push_to_hub, **kwargs)
shutil.copyfile(os.path.join(self.dir, 'replace.json'), f'{save_directory}/replace.json')
shutil.copyfile(os.path.join(self.dir, 'cctokenizer.py'), f'{save_directory}/cctokenizer.py')
return ret