""" ## 简介 - bert和clue词典比较 https://github.com/CLUEbenchmark/CLUECorpus2020#%E8%AF%8D%E8%A1%A8%E4%BB%8B%E7%BB%8D - 相关issue: https://github.com/google-research/bert/issues/396 - bert中文词典大小21128(2万) - 英文字母都小写了(有没有不小写的?) - args: - - output: - python bpe_oov.py \ --vocab-bpe vocab.google.txt \ --inputs ../raw/discovery_all \ --workers 60 # stderr打印在屏幕,stdout放在oov_lines python bpe_oov.py \ --vocab-bpe vocab.clue_plus.txt \ --inputs ../raw/discovery_all \ --workers 60 > oov_lines python bpe_oov.py \ --vocab-bpe vocab.clue_plus.txt \ --inputs ../raw/small/jd.train.raw \ --workers 60 > oov_lines ## 整词 """ import argparse from transformers import BertTokenizer import contextlib import sys from collections import defaultdict from multiprocessing import Pool def main(): parser = argparse.ArgumentParser() parser.add_argument( "--vocab-bpe", type=str, help='path to vocab.bpe', ) parser.add_argument( "--inputs", nargs="+", default=['-'], help="input files to filter/encode", ) parser.add_argument("--workers", type=int, default=20) args = parser.parse_args() with contextlib.ExitStack() as stack: inputs = [ stack.enter_context(open(input, "r", encoding="utf-8")) if input != "-" else sys.stdin for input in args.inputs ] encoder = MultiprocessingEncoder(args.vocab_bpe) pool = Pool(args.workers, initializer=encoder.initializer) oov_lines = pool.imap(encoder.get_oov_lines, zip(*inputs), 100) oov_count = defaultdict(int) for i, oov_line in enumerate(oov_lines, start=1): # 主要的计算模块 for oov in oov_line: oov_count[oov] += 1 if i % 10000 == 0: print("processed {} lines".format(i), file=sys.stderr) sorted_oov = sorted(oov_count.items(), key=lambda kv:kv[1], reverse=True) with open('oov', 'w', encoding='utf-8') as f_out: f_out.write('\n'.join(['%s %d' % (k,v) for k, v in sorted_oov])) class MultiprocessingEncoder(object): def __init__(self, vocab_bpe): self.vocab_bpe = vocab_bpe def initializer(self): # 为啥不放到 __init__ ? global bpe # 为什么用global,设置成成员变量不行吗? bpe = BertTokenizer(self.vocab_bpe) def get_oov(self, line): global bpe oov_tokens = [] for token in bpe.basic_tokenizer.tokenize(line, never_split=bpe.all_special_tokens): for sub_token in bpe.wordpiece_tokenizer.tokenize(token): if sub_token == '[UNK]': oov_tokens.append(token) if len(oov_tokens) > 0: # 不用在这里打印,因为有些明显需要添加的token print(','.join(oov_tokens) + '\t' + line) return oov_tokens def encode(self, line): global bpe ids = bpe.encode(line) return list(map(str, ids)) def decode(self, tokens): global bpe return bpe.decode(tokens) def get_oov_lines(self, lines): """ Encode a set of lines. All lines will be encoded together. """ all_oov = [] for line in lines: line = line.strip() oov_tokens = self.get_oov(line) all_oov += oov_tokens return all_oov def encode_lines(self, lines): """ Encode a set of lines. All lines will be encoded together. """ enc_lines = [] for line in lines: line = line.strip() if len(line) == 0 and not self.args.keep_empty: return ["EMPTY", None] tokens = self.encode(line) enc_lines.append(" ".join(tokens)) return ["PASS", enc_lines] def test(): encoder = MultiprocessingEncoder('vocab.clue_plus.txt') encoder.initializer() line = '蔲驰的,africa❸ 11111111111165000mg❗2⃣piqueddasdasddasdasda,明天25℃,面积120㎡,大约2~3米' \ '3200×1800分辨率,TAS海关密码锁,PC镜片,采用A+节能能,胶包裏,包裹,薄至6㎜,鬼塚虎,' \ '多种矿物元素,特别是锶,靚眼,门闩和便携把手,箜篌刺绣,5㎝,锐蝮蛇竞技版鼠标,滑屛式,T桖,sub+dvi,' \ '呵护牙齦,Baumatic™ ,' en = encoder.encode(line) print(line) print(en) print(encoder.decode(en)) if __name__ == "__main__": #main() test()