## vocab.txt

```
るのは
よね
写真,寫真,冩真,写眞,寫眞,冩眞
マイ
そん
女性,𠨰性,⼥性,女𧢱,𠨰𧢱,⼥𧢱
内容,內容,内㣑,内㝐,内彮,内𠕺,內㣑,內㝐,內彮,內𠕺
```

同一个词条怎么还有多种不同写法??

## 文本归一化

以下的 normalization 对生成任务并不友好。

```python
        self.content_repatter1 = re.compile(r"(https?|ftp)(:\/\/[-_\.!~*\'()a-zA-Z0-9;\/?:\@&=\+$,%#]+)")
        self.content_repatter2 = re.compile(r"[A-Za-z0-9\._+]*@[\-_0-9A-Za-z]+(\.[A-Za-z]+)*")
        self.content_repatter3 = re.compile(r"[\(]{0,1}[0-9]{2,4}[\)\-\(]{0,1}[0-9]{2,4}[\)\-]{0,1}[0-9]{3,4}")
        self.content_repatter4 = re.compile(
            r"([12]\d{3}[/\-年])*(0?[1-9]|1[0-2])[/\-月]((0?[1-9]|[12][0-9]|3[01])日?)*(\d{1,2}|:|\d{1,2}時|\d{1,2}分|\(日\)|\(月\)|\(火\)|\(水\)|\(木\)|\(金\)|\(土\)|㈰|㈪|㈫|㈬|㈭|㈮|㈯)*"
        )
        self.content_repatter5 = re.compile(
            r"(明治|大正|昭和|平成|令和|㍾|㍽|㍼|㍻|\u32ff)\d{1,2}年(0?[1-9]|1[0-2])月(0?[1-9]|[12][0-9]|3[01])日(\d{1,2}|:|\d{1,2}時|\d{1,2}分|\(日\)|\(月\)|\(火\)|\(水\)|\(木\)|\(金\)|\(土\)|㈰|㈪|㈫|㈬|㈭|㈮|㈯)*"
        )
        self.content_repatter6 = re.compile(
            r"((0|[1-9]\d*|[1-9]\d{0,2}(,\d{3})+)*億)*((0|[1-9]\d*|[1-9]\d{0,2}(,\d{3})+)*万)*((0|[1-9]\d*|[1-9]\d{0,2}(,\d{3})+)*千)*(0|[1-9]\d*|[1-9]\d{0,2}(,\d{3})+)*(千円|万円|千万円|円|千ドル|万ドル|千万ドル|ドル|千ユーロ|万ユーロ|千万ユーロ|ユーロ)+(\(税込\)|\(税抜\)|\+tax)*"
        )

    def clean_text(self, content):
        content = self.content_repatter1.sub("<URL>", content)
        content = self.content_repatter2.sub("<EMAIL>", content)
        content = self.content_repatter3.sub("<TEL>", content)
        content = self.content_repatter4.sub("<DATE>", content)
        content = self.content_repatter5.sub("<DATE>", content)
        content = self.content_repatter6.sub("<PRICE>", content)
        content = content.translate(self.content_trans1)
        while "<BLOCK><BLOCK>" in content:
            content = content.replace("<BLOCK><BLOCK>", "<BLOCK>")
        return content

    def tokenize(self, text, clean=False):
        text = text.replace(" ", "<SP>")
        text = text.replace("　", "<SP>")
        text = text.replace("\r\n", "<BR>")
        text = text.replace("\n", "<BR>")
        text = text.replace("\r", "<BR>")
        text = text.replace("\t", "<TAB>")
        text = text.replace("—", "ー")
        text = text.replace("−", "ー")
        for k, v in self.emoji["emoji"].items():
            if k in text:
                text = text.replace(k, v)
        if clean:
            text = self.clean_text(text)
```