# coding=utf-8 # author: xusong # time: 2021/10/9 14:32 """ <风格>复古的旗袍款式 1. 先分词,再标签。 """ import os import re from transformers import BertTokenizer from collections import defaultdict, Counter PATTEN_BIO = re.compile('?') def parse_tag_words(subwords): """ HC<领型>圆领<风格>拼接连衣裙 """ tags = [] new_subwords = [] i = 0 entity = '' while i < len(subwords): if subwords[i] == '<': if entity == '': # <领型> for j in range(i+1, len(subwords)): if subwords[j] == '>': break entity += subwords[j] else: # for j in range(i+1, len(subwords)): if subwords[j] == '>': break entity = '' i = j + 1 continue if entity != '': # 圆领 for j in range(i, len(subwords)): if subwords[j] == '<': i = j break new_subwords.append(subwords[j]) if j == i: tags.append('B-' + entity) else: tags.append('I-' + entity) continue tags.append('O') new_subwords.append(subwords[i]) i = i + 1 return new_subwords, tags def bpe(part='train'): bpe_dir = 'bpe' # all_attr = [line.strip().split('\t')[0] for line in open('raw/vocab_bioattr.txt')] # BertTokenizer.SPECIAL_TOKENS_ATTRIBUTES += all_attr bpe = BertTokenizer('../vocab/vocab.jd.txt') # never_split参数对tokenizer不起作用 f_src = open(os.path.join(bpe_dir, part + '.src'), 'w', encoding='utf-8') f_tgt = open(os.path.join(bpe_dir, part + '.tgt'), 'w', encoding='utf-8') for line in open('raw/jdai.jave.fashion.' + part, 'r', encoding='utf-8'): cid, sid, sent, tag_sent = line.strip().split('\t') subwords = bpe._tokenize(tag_sent) subwords, tags = parse_tag_words(subwords) f_src.write(' '.join(subwords) + '\n') f_tgt.write(' '.join(tags) + '\n') def find_all_entity(): all_tag_sent = [] for line in open('raw/jdai.jave.fashion.train', 'r', encoding='utf-8'): cid, sid, sent, tag_sent = line.strip().split('\t') all_tag_sent.append(tag_sent) entity_list = re.findall('<.*?>', ''.join(all_tag_sent)) aa = Counter(entity_list) f_out = open('raw/vocab_bioattr.txt', 'w', encoding='utf-8') f_out.write(''.join(['{}\t{}\n'.format(k,cnt) for k, cnt in aa.items()])) if __name__ == "__main__": # find_all_entity() for part in ['train', 'valid', 'test']: bpe(part)