# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Credits
    This code is modified from https://github.com/GitYCC/g2pW
"""
import os
import re


def wordize_and_map(text: str):
    """Split ``text`` into "words" (runs of ASCII letters/digits, or single
    characters otherwise) and build index maps between character positions
    in the original text and word positions.

    Returns:
        words: list of word strings (spaces are dropped).
        index_map_from_text_to_word: for each character of ``text``, the index
            of the word it belongs to, or None for spaces.
        index_map_from_word_to_text: for each word, its (start, end) character
            span in ``text``.
    """
    words = []
    index_map_from_text_to_word = []
    index_map_from_word_to_text = []
    while len(text) > 0:
        # Spaces belong to no word; they map to None.
        match_space = re.match(r"^ +", text)
        if match_space:
            space_str = match_space.group(0)
            index_map_from_text_to_word += [None] * len(space_str)
            text = text[len(space_str):]
            continue

        # A run of ASCII letters/digits is kept together as one word.
        match_en = re.match(r"^[a-zA-Z0-9]+", text)
        if match_en:
            en_word = match_en.group(0)

            word_start_pos = len(index_map_from_text_to_word)
            word_end_pos = word_start_pos + len(en_word)
            index_map_from_word_to_text.append((word_start_pos, word_end_pos))

            index_map_from_text_to_word += [len(words)] * len(en_word)

            words.append(en_word)
            text = text[len(en_word):]
        else:
            # Any other character (e.g. a CJK character or punctuation) forms
            # a word of its own.
            word_start_pos = len(index_map_from_text_to_word)
            word_end_pos = word_start_pos + 1
            index_map_from_word_to_text.append((word_start_pos, word_end_pos))

            index_map_from_text_to_word += [len(words)]

            words.append(text[0])
            text = text[1:]
    return words, index_map_from_text_to_word, index_map_from_word_to_text


def tokenize_and_map(tokenizer, text: str):
    """Tokenize ``text`` with a WordPiece-style ``tokenizer`` and build index
    maps between character positions in the original text and token positions.

    Returns:
        tokens: list of tokens.
        index_map_from_text_to_token: for each character of ``text``, the
            index of the token covering it, or None for spaces.
        index_map_from_token_to_text: for each token, its (start, end)
            character span in ``text``.
    """
    words, text2word, word2text = wordize_and_map(text=text)

    tokens = []
    index_map_from_token_to_text = []
    for word, (word_start, word_end) in zip(words, word2text):
        word_tokens = tokenizer.tokenize(word)

        if len(word_tokens) == 0 or word_tokens == ["[UNK]"]:
            # The whole word maps to a single [UNK] token.
            index_map_from_token_to_text.append((word_start, word_end))
            tokens.append("[UNK]")
        else:
            # Distribute the word's character span across its sub-tokens,
            # ignoring the "##" continuation prefix when measuring length.
            current_word_start = word_start
            for word_token in word_tokens:
                word_token_len = len(re.sub(r"^##", "", word_token))
                index_map_from_token_to_text.append(
                    (current_word_start, current_word_start + word_token_len))
                current_word_start = current_word_start + word_token_len
                tokens.append(word_token)

    # Reuse the text->word map and overwrite it in place with token indices.
    index_map_from_text_to_token = text2word
    for i, (token_start, token_end) in enumerate(index_map_from_token_to_text):
        for token_pos in range(token_start, token_end):
            index_map_from_text_to_token[token_pos] = i

    return tokens, index_map_from_text_to_token, index_map_from_token_to_text


def _load_config(config_path: os.PathLike):
    """Load a Python config file as a module object and return it."""
    import importlib.util

    spec = importlib.util.spec_from_file_location("__init__", config_path)
    config = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(config)
    return config


# Default hyper-parameters; load_config() falls back to these for any setting
# missing from the user-supplied config file.
default_config_dict = {
    "manual_seed": 1313,
    "model_source": "bert-base-chinese",
    "window_size": 32,
    "num_workers": 2,
    "use_mask": True,
    "use_char_phoneme": False,
    "use_conditional": True,
    "param_conditional": {
        "affect_location": "softmax",
        "bias": True,
        "char-linear": True,
        "pos-linear": False,
        "char+pos-second": True,
        "char+pos-second_lowrank": False,
        "lowrank_size": 0,
        "char+pos-second_fm": False,
        "fm_size": 0,
        "fix_mode": None,
        "count_json": "train.count.json",
    },
    "lr": 5e-5,
    "val_interval": 200,
    "num_iter": 10000,
    "use_focal": False,
    "param_focal": {
        "alpha": 0.0,
        "gamma": 0.7,
    },
    "use_pos": True,
    "param_pos": {
        "weight": 0.1,
        "pos_joint_training": True,
        "train_pos_path": "train.pos",
        "valid_pos_path": "dev.pos",
        "test_pos_path": "test.pos",
    },
}


def load_config(config_path: os.PathLike, use_default: bool = False):
    """Load a config module; with ``use_default=True``, fill in any missing
    top-level attributes, and any missing keys of dict-valued attributes,
    from ``default_config_dict``."""
    config = _load_config(config_path)
    if use_default:
        for attr, val in default_config_dict.items():
            if not hasattr(config, attr):
                setattr(config, attr, val)
            elif isinstance(val, dict):
                d = getattr(config, attr)
                for dict_k, dict_v in val.items():
                    if dict_k not in d:
                        d[dict_k] = dict_v
    return config