Spaces:
Running
Running
| import os | |
| import re | |
# Basic ASCII punctuation. "." and "?" are passed through re.escape, so they
# are stored with a leading backslash; the other marks are stored verbatim.
period, question_mark = (re.escape(mark) for mark in ".?")
comma, colon, semicolon = ",", ":", ";"
exclamation_mark = "!"
left_curly_bracket, right_curly_bracket = "{", "}"
quotation_mark = '"'

# Concatenation of the marks above, in this fixed order. `semicolon` and
# `quotation_mark` are not part of it.
basic_punc = "".join(
    (
        period,
        question_mark,
        comma,
        colon,
        exclamation_mark,
        left_curly_bracket,
        right_curly_bracket,
    )
)
# General Punctuation Unicode block (0x2000-0x206F).
# Every value is the six-character escape text (backslash, "u", four hex
# digits) held in a raw string, not the character itself.

# Zero-width characters
zero_width_space = r"\u200B"
zero_width_nonjoiner = r"\u200C"

# Directional formatting characters
left_to_right_mark = r"\u200E"
right_to_left_mark = r"\u200F"
left_to_right_embedding = r"\u202A"
pop_directional_formatting = r"\u202C"

# Curly single quotes that are commonly typed in place of an apostrophe
left_single_quotation_mark = r"\u2018"
right_single_quotation_mark = r"\u2019"
# Language specific definitions.
# Unless noted, values are escape text in raw strings; a "-" between two
# escapes denotes a code point range.

# --- Spanish ---
inverted_exclamation_mark = r"\u00A1"
inverted_question_mark = r"\u00BF"

# --- Hindi ---
# Not a raw string: this holds the single character U+0964 itself.
hindi_danda = "\u0964"

# --- Egyptian Arabic ---
# arabic_percent = r"\u066A"
arabic_comma = r"\u060C"
arabic_semicolon = r"\u061B"
arabic_question_mark = r"\u061F"
arabic_diacritics = r"\u064B-\u0652"
arabic_subscript_alef_and_inverted_damma = r"\u0656-\u0657"
# Chinese
# Values are escape text in raw strings; a "-" between two escapes denotes a
# code point range.
full_stop = r"\u3002"
full_comma = r"\uFF0C"
full_exclamation_mark = r"\uFF01"
full_question_mark = r"\uFF1F"
full_semicolon = r"\uFF1B"
full_colon = r"\uFF1A"
full_parentheses = r"\uFF08\uFF09"
quotation_mark_horizontal = r"\u300C-\u300F"
# Vertical presentation forms of the corner brackets are U+FE41-U+FE44.
# The previous value, \uFF41-\uFF44, is the fullwidth Latin letters a-d,
# which are ordinary text rather than punctuation.
quotation_mark_vertical = r"\uFE41-\uFE44"
title_marks = r"\u3008-\u300B"
wavy_low_line = r"\uFE4F"
ellipsis = r"\u22EF"
enumeration_comma = r"\u3001"
hyphenation_point = r"\u2027"
forward_slash = r"\uFF0F"
wavy_dash = r"\uFF5E"
box_drawings_light_horizontal = r"\u2500"
fullwidth_low_line = r"\uFF3F"

# All Chinese punctuation above, concatenated in declaration order.
chinese_punc = (
    full_stop
    + full_comma
    + full_exclamation_mark
    + full_question_mark
    + full_semicolon
    + full_colon
    + full_parentheses
    + quotation_mark_horizontal
    + quotation_mark_vertical
    + title_marks
    + wavy_low_line
    + ellipsis
    + enumeration_comma
    + hyphenation_point
    + forward_slash
    + wavy_dash
    + box_drawings_light_horizontal
    + fullwidth_low_line
)
# Armenian
# Values are escape text in raw strings.
armenian_apostrophe = r"\u055A"
emphasis_mark = r"\u055B"
armenian_exclamation_mark = r"\u055C"
armenian_comma = r"\u055D"
armenian_question_mark = r"\u055E"
abbreviation_mark = r"\u055F"
armenian_full_stop = r"\u0589"

# Backward compatibility: this module has always rebound `exclamation_mark`
# to the Armenian mark at this point, replacing the earlier ASCII "!" value.
# The rebinding is kept so existing importers see the same value; new code
# should use `armenian_exclamation_mark`.
exclamation_mark = armenian_exclamation_mark

# All Armenian punctuation above, concatenated in code point order.
armenian_punc = (
    armenian_apostrophe
    + emphasis_mark
    + armenian_exclamation_mark
    + armenian_comma
    + armenian_question_mark
    + abbreviation_mark
    + armenian_full_stop
)
# Written (HTML entity) forms that can appear literally in text.
# NOTE(review): these literals had been reduced to the bare characters
# "<", ">" and " ", which duplicated the *_sign constants below and made the
# nbsp "written form" an actual space. They are restored to the entity
# spellings; confirm against the upstream copy of this file.
lesser_than_symbol = r"&lt;"
greater_than_symbol = r"&gt;"

# The "<" and ">" characters themselves, as escape text.
lesser_than_sign = r"\u003c"
greater_than_sign = r"\u003e"

# Entity spelling without the trailing ";".
nbsp_written_form = r"&nbsp"
# Quotation marks (escape text in raw strings).
left_double_quotes, right_double_quotes = r"\u201c", r"\u201d"
left_double_angle, right_double_angle = r"\u00ab", r"\u00bb"
left_single_angle, right_single_angle = r"\u2039", r"\u203a"
low_double_quotes, low_single_quotes = r"\u201e", r"\u201a"
high_double_quotes, high_single_quotes = r"\u201f", r"\u201b"

# Every quote variant, including the two curly single quotes defined with
# the general punctuation constants.
all_punct_quotes = "".join(
    [
        left_double_quotes,
        right_double_quotes,
        left_double_angle,
        right_double_angle,
        left_single_angle,
        right_single_angle,
        low_double_quotes,
        low_single_quotes,
        high_double_quotes,
        high_single_quotes,
        right_single_quotation_mark,
        left_single_quotation_mark,
    ]
)

# Bracketed character class holding the three single-quote variants.
mapping_quotes = (
    f"[{high_single_quotes}"
    f"{right_single_quotation_mark}"
    f"{left_single_quotation_mark}]"
)
# Digits
# Each value is a code point range written as escape text ("first-last"),
# except `nominal_digit_shapes`, which is a single code point.
english_digits = r"\u0030-\u0039"
fullwidth_digits = r"\uff10-\uff19"
extended_arabic_indic_digits = r"\u06f0-\u06f9"
devanagari_digits = r"\u0966-\u096f"
bengali_digits = r"\u09e6-\u09ef"
oriya_digits = r"\u0b66-\u0b6f"
malayam_digits = r"\u0d66-\u0d6f"
myanmar_digits = r"\u1040-\u1049"
khmer_digits = r"\u17e0-\u17e9"
kayah_li_digits = r"\ua900-\ua909"
roman_numeral = r"\u2170-\u2179"
nominal_digit_shapes = r"\u206f"
# Load punctuations from MMS-lab data.
# punctuations.lst sits next to this module. Each line is tab separated and
# its first field is the punctuation to be removed.
# abspath + join keeps the path correct even when __file__ has no directory
# part; formatting an empty dirname into "{dir}/punctuations.lst" would point
# at the filesystem root.
_punc_list_path = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), "punctuations.lst"
)
# NOTE(review): assumes the list is UTF-8 encoded; made explicit so the
# result does not depend on the platform default encoding.
with open(_punc_list_path, "r", encoding="utf-8") as punc_f:
    punc_list = punc_f.readlines()

# Escaped first fields, concatenated. The line terminator is dropped before
# splitting and whitespace-only lines are skipped, so a blank line or a line
# without a tab can no longer add a newline to the pattern.
punct_pattern = "".join(
    re.escape(line.rstrip("\r\n").split("\t")[0])
    for line in punc_list
    if line.strip()
)
# Digit ranges shared by every language, concatenated in this fixed order.
shared_digits = "".join(
    (
        english_digits,
        bengali_digits,
        khmer_digits,
        devanagari_digits,
        oriya_digits,
        extended_arabic_indic_digits,
        kayah_li_digits,
        fullwidth_digits,
        malayam_digits,
        myanmar_digits,
        roman_numeral,
        nominal_digit_shapes,
    )
)
# Punctuation shared by every language: the hand-written groups first, then
# the pattern built from punctuations.lst. Some marks (e.g. full_stop and
# enumeration_comma) appear both on their own and inside chinese_punc.
shared_punc_list = "".join(
    (
        basic_punc,
        all_punct_quotes,
        greater_than_sign,
        lesser_than_sign,
        inverted_question_mark,
        full_stop,
        semicolon,
        armenian_punc,
        inverted_exclamation_mark,
        arabic_comma,
        enumeration_comma,
        hindi_danda,
        quotation_mark,
        arabic_semicolon,
        arabic_question_mark,
        chinese_punc,
        punct_pattern,
    )
)
# Replacement table shared by every language: each key is mapped to the empty
# string.
shared_mappping = dict.fromkeys(
    (lesser_than_symbol, greater_than_symbol, nbsp_written_form),
    "",
)
# r"(\S+)" + mapping_quotes + r"(\S+)": r"\1'\2",  # slow to run
# Characters deleted for every language: zero-width and directional
# formatting characters plus the Arabic diacritic ranges.
shared_deletion_list = "".join(
    (
        left_to_right_mark,
        zero_width_nonjoiner,
        arabic_subscript_alef_and_inverted_damma,
        zero_width_space,
        arabic_diacritics,
        pop_directional_formatting,
        right_to_left_mark,
        left_to_right_embedding,
    )
)
# Normalization settings keyed by language code. "*" holds the defaults that
# the per-language entries start from.
norm_config = {
    "*": dict(
        lower_case=True,
        punc_set=shared_punc_list,
        del_set=shared_deletion_list,
        mapping=shared_mappping,
        digit_set=shared_digits,
        unicode_norm="NFKC",
        rm_diacritics=False,
    ),
}
def _derive_config(base_lang):
    """Return an independent copy of ``norm_config[base_lang]``.

    ``dict.copy()`` is shallow, so the copied entry would still share its
    "mapping" dict with the base entry, and an in-place edit for one language
    would change every other language too. The mapping dict is therefore
    copied as well. The remaining values are strings and booleans, which are
    immutable and safe to share.
    """
    derived = norm_config[base_lang].copy()
    derived["mapping"] = dict(derived["mapping"])
    return derived


# =============== Mongolian ===============#
norm_config["mon"] = _derive_config("*")
# add soft hyphen to the deletion set to match with fleurs
norm_config["mon"]["del_set"] += r"\u00AD"
norm_config["khk"] = _derive_config("mon")

# =============== Hebrew ===============#
norm_config["heb"] = _derive_config("*")
# add "HEBREW POINT" symbols to the deletion set to match with fleurs
norm_config["heb"]["del_set"] += r"\u05B0-\u05BF\u05C0-\u05CF"

# =============== Thai ===============#
norm_config["tha"] = _derive_config("*")
# add "Zero width joiner" symbols to the punctuation set to match with fleurs
norm_config["tha"]["punc_set"] += r"\u200D"

# =============== Arabic ===============#
norm_config["ara"] = _derive_config("*")
# Applies to Arabic only: the mapping dict is private to this entry, so the
# rule no longer leaks into "*" and the other languages.
norm_config["ara"]["mapping"]["ٱ"] = "ا"
norm_config["arb"] = _derive_config("ara")

# =============== Javanese ===============#
norm_config["jav"] = _derive_config("*")
norm_config["jav"]["rm_diacritics"] = True