# File: mms-zeroshot1/utils/norm_config.py
# Last commit: 6f27821 ("autolm") by Vineel Pratap
# Size: 6.53 kB
import os
import re
# ASCII punctuation fragments. The period and the question mark are passed
# through re.escape, so they are stored with a leading backslash.
colon, comma, semicolon = ":", ",", ";"
exclamation_mark = "!"
period, question_mark = re.escape("."), re.escape("?")
left_curly_bracket, right_curly_bracket = "{", "}"
quotation_mark = '"'

# Basic punctuation set; semicolon and quotation_mark are not part of it.
basic_punc = "".join(
    (
        period,
        question_mark,
        comma,
        colon,
        exclamation_mark,
        left_curly_bracket,
        right_curly_bracket,
    )
)
# General punc unicode block (0x2000-0x206F)
# These are raw strings: each holds the six-character text "\uXXXX", not the
# character itself (presumably so the regex engine expands it once the
# fragment is placed inside a pattern -- confirm against the consumer).
zero_width_space = r"\u200B"
zero_width_nonjoiner = r"\u200C"
left_to_right_mark = r"\u200E"
right_to_left_mark = r"\u200F"
left_to_right_embedding = r"\u202A"
pop_directional_formatting = r"\u202C"
# Here are some commonly ill-typed versions of apostrophe
right_single_quotation_mark = r"\u2019"
left_single_quotation_mark = r"\u2018"
# Language specific definitions
# Spanish
inverted_exclamation_mark = r"\u00A1"
inverted_question_mark = r"\u00BF"
# Hindi
# Unlike its neighbours this literal is not raw, so it is the actual U+0964
# character rather than an escape sequence.
hindi_danda = u"\u0964"
# Egyptian Arabic
# arabic_percent = r"\u066A"
arabic_comma = r"\u060C"
arabic_question_mark = r"\u061F"
arabic_semicolon = r"\u061B"
# The next two values are "start-end" ranges, so they are only meaningful
# inside a regex character class.
arabic_diacritics = r"\u064B-\u0652"
arabic_subscript_alef_and_inverted_damma = r"\u0656-\u0657"
# Chinese
# Raw "\uXXXX" fragments (and "start-end" ranges) that are concatenated into
# chinese_punc below; the ranges only make sense inside a regex character class.
full_stop = r"\u3002"
full_comma = r"\uFF0C"
full_exclamation_mark = r"\uFF01"
full_question_mark = r"\uFF1F"
full_semicolon = r"\uFF1B"
full_colon = r"\uFF1A"
full_parentheses = r"\uFF08\uFF09"
quotation_mark_horizontal = r"\u300C-\u300F"
# Vertical corner-bracket quotation marks are U+FE41-U+FE44. The range
# U+FF41-U+FF44 holds the fullwidth Latin letters a-d, which are not
# punctuation and must not be part of this set.
quotation_mark_vertical = r"\uFE41-\uFE44"
title_marks = r"\u3008-\u300B"
wavy_low_line = r"\uFE4F"
ellipsis = r"\u22EF"
enumeration_comma = r"\u3001"
hyphenation_point = r"\u2027"
forward_slash = r"\uFF0F"
wavy_dash = r"\uFF5E"
box_drawings_light_horizontal = r"\u2500"
fullwidth_low_line = r"\uFF3F"

# All Chinese punctuation fragments, in the original concatenation order.
chinese_punc = "".join(
    (
        full_stop,
        full_comma,
        full_exclamation_mark,
        full_question_mark,
        full_semicolon,
        full_colon,
        full_parentheses,
        quotation_mark_horizontal,
        quotation_mark_vertical,
        title_marks,
        wavy_low_line,
        ellipsis,
        enumeration_comma,
        hyphenation_point,
        forward_slash,
        wavy_dash,
        box_drawings_light_horizontal,
        fullwidth_low_line,
    )
)
# Armenian
armenian_apostrophe = r"\u055A"
emphasis_mark = r"\u055B"
# NOTE: this rebinds the module-level name `exclamation_mark`, which was
# assigned "!" earlier in the file; from here on it is the Armenian mark.
exclamation_mark = r"\u055C"
armenian_comma = r"\u055D"
armenian_question_mark = r"\u055E"
abbreviation_mark = r"\u055F"
armenian_full_stop = r"\u0589"

# Every Armenian punctuation fragment, joined in code-point order.
armenian_punc = "".join(
    (
        armenian_apostrophe,
        emphasis_mark,
        exclamation_mark,
        armenian_comma,
        armenian_question_mark,
        abbreviation_mark,
        armenian_full_stop,
    )
)
# Literal "<" / ">" characters, followed by the same two characters written
# as raw "\uXXXX" escape text.
lesser_than_symbol, greater_than_symbol = "<", ">"
lesser_than_sign, greater_than_sign = r"\u003c", r"\u003e"
# The non-breaking-space HTML entity as it shows up in text, without the
# trailing semicolon.
nbsp_written_form = "&nbsp"
# Quotation marks
left_double_quotes = r"\u201c"
right_double_quotes = r"\u201d"
left_double_angle = r"\u00ab"
right_double_angle = r"\u00bb"
left_single_angle = r"\u2039"
right_single_angle = r"\u203a"
low_double_quotes = r"\u201e"
low_single_quotes = r"\u201a"
high_double_quotes = r"\u201f"
high_single_quotes = r"\u201b"

# Every quote-like fragment, including the two single quotation marks
# declared earlier in the file with the apostrophe look-alikes.
all_punct_quotes = "".join(
    (
        left_double_quotes,
        right_double_quotes,
        left_double_angle,
        right_double_angle,
        left_single_angle,
        right_single_angle,
        low_double_quotes,
        low_single_quotes,
        high_double_quotes,
        high_single_quotes,
        right_single_quotation_mark,
        left_single_quotation_mark,
    )
)

# Character class covering the three single-quote variants listed below.
mapping_quotes = (
    f"[{high_single_quotes}"
    f"{right_single_quotation_mark}"
    f"{left_single_quotation_mark}]"
)
# Digits
# Each value is a raw "start-end" code-point range (ten code points wide),
# except nominal_digit_shapes, which is a single code point. The ranges are
# only meaningful inside a regex character class.
english_digits = r"\u0030-\u0039"
bengali_digits = r"\u09e6-\u09ef"
khmer_digits = r"\u17e0-\u17e9"
devanagari_digits = r"\u0966-\u096f"
oriya_digits = r"\u0b66-\u0b6f"
extended_arabic_indic_digits = r"\u06f0-\u06f9"
kayah_li_digits = r"\ua900-\ua909"
fullwidth_digits = r"\uff10-\uff19"
# NOTE: the name is spelled "malayam" here; it is kept as-is because other
# code may import it under this name.
malayam_digits = r"\u0d66-\u0d6f"
myanmar_digits = r"\u1040-\u1049"
roman_numeral = r"\u2170-\u2179"
nominal_digit_shapes = r"\u206f"
# Load punctuations from MMS-lab data.
# The path is built with os.path.join so that an empty dirname(__file__)
# resolves to "punctuations.lst" in the current directory instead of the
# filesystem root ("/punctuations.lst").
# The encoding is pinned because the platform default is not UTF-8 everywhere
# (assumes the list is stored as UTF-8 -- confirm against the data file).
with open(
    os.path.join(os.path.dirname(__file__), "punctuations.lst"),
    "r",
    encoding="utf-8",
) as punc_f:
    punc_list = punc_f.readlines()

# The first tab-separated field of every line is the punctuation to remove;
# it is regex-escaped before being appended to the pattern.
punct_pattern = "".join(re.escape(punc.split("\t")[0]) for punc in punc_list)
# Digit ranges shared by every language configuration.
shared_digits = "".join(
    (
        english_digits,
        bengali_digits,
        khmer_digits,
        devanagari_digits,
        oriya_digits,
        extended_arabic_indic_digits,
        kayah_li_digits,
        fullwidth_digits,
        malayam_digits,
        myanmar_digits,
        roman_numeral,
        nominal_digit_shapes,
    )
)
# Punctuation fragments shared by every language configuration. Some entries
# (full_stop, enumeration_comma) also appear inside chinese_punc; the
# duplication is kept exactly as in the original concatenation.
shared_punc_list = "".join(
    (
        basic_punc,
        all_punct_quotes,
        greater_than_sign,
        lesser_than_sign,
        inverted_question_mark,
        full_stop,
        semicolon,
        armenian_punc,
        inverted_exclamation_mark,
        arabic_comma,
        enumeration_comma,
        hindi_danda,
        quotation_mark,
        arabic_semicolon,
        arabic_question_mark,
        chinese_punc,
        punct_pattern,
    )
)
# Pattern -> replacement table shared by every language configuration.
# "<", ">" and "&nbsp" map to the empty string.
shared_mappping = dict.fromkeys(
    (lesser_than_symbol, greater_than_symbol, nbsp_written_form), ""
)
# A quote variant surrounded by non-space runs is rewritten to a plain
# ASCII apostrophe between the two captured groups.
shared_mappping[rf"(\S+){mapping_quotes}(\S+)"] = r"\1'\2"
# Fragments placed in the "del_set" of the default configuration:
# directional/zero-width formatting characters and Arabic diacritic ranges.
shared_deletion_list = "".join(
    (
        left_to_right_mark,
        zero_width_nonjoiner,
        arabic_subscript_alef_and_inverted_damma,
        zero_width_space,
        arabic_diacritics,
        pop_directional_formatting,
        right_to_left_mark,
        left_to_right_embedding,
    )
)
# Normalization settings keyed by language code. The "*" entry is the base
# configuration that the language-specific entries further down are copied from.
norm_config = {
    "*": dict(
        lower_case=True,
        punc_set=shared_punc_list,
        del_set=shared_deletion_list,
        mapping=shared_mappping,
        digit_set=shared_digits,
        unicode_norm="NFKC",
        rm_diacritics=False,
    )
}
# Language-specific configurations. Each one starts as a shallow copy of the
# "*" entry. Rebinding a key (e.g. `+=` on a string value) only affects the
# copy, but mutating a nested object in place would affect every entry that
# shares it, so nested values are replaced rather than mutated.

#=============== Mongolian ===============#
norm_config["mon"] = norm_config["*"].copy()
# add soft hyphen to the deletion set to match with fleurs
norm_config["mon"]["del_set"] += r"\u00AD"

norm_config["khk"] = norm_config["mon"].copy()

#=============== Hebrew ===============#
norm_config["heb"] = norm_config["*"].copy()
# add "HEBREW POINT" symbols to the deletion set to match with fleurs
norm_config["heb"]["del_set"] += r"\u05B0-\u05BF\u05C0-\u05CF"

#=============== Thai ===============#
norm_config["tha"] = norm_config["*"].copy()
# add "Zero width joiner" symbols to the punctuation set to match with fleurs
norm_config["tha"]["punc_set"] += r"\u200D"

#=============== Arabic ===============#
norm_config["ara"] = norm_config["*"].copy()
# dict.copy() is shallow, so "mapping" is still the dict shared with "*" and
# every other language. Build a new dict for Arabic instead of inserting into
# the shared one, otherwise this rule would apply to all languages.
norm_config["ara"]["mapping"] = {**norm_config["ara"]["mapping"], "ٱ": "ا"}

# "arb" reuses the Arabic settings, including the Arabic-only mapping.
norm_config["arb"] = norm_config["ara"].copy()

#=============== Javanese ===============#
norm_config["jav"] = norm_config["*"].copy()
norm_config["jav"]["rm_diacritics"] = True