# coding=utf-8
# author: xusong <xusong28@jd.com>
# time: 2021/10/9 14:32

"""

<风格>复古</风格>的旗袍款式

1. 先分词,再标签。

"""

import os
import re
from transformers import BertTokenizer
from collections import Counter


PATTERN_BIO = re.compile('<.*?>')  # matches one attribute tag, e.g. <风格> or </风格>

def parse_tag_words(subwords):
    """ Split tags from text, e.g. HC<领型>圆领</领型><风格>拼接</风格>连衣裙 """
    tags = []
    new_subwords = []
    i = 0
    entity = ''
    while i < len(subwords):
        if subwords[i] == '<':
            j = len(subwords) - 1  # fallback in case '>' is missing
            if entity == '':  # opening tag, e.g. <领型>: collect the entity name
                for j in range(i + 1, len(subwords)):
                    if subwords[j] == '>':
                        break
                    entity += subwords[j]
            else:  # closing tag, e.g. </领型>: skip ahead to '>' and reset
                for j in range(i + 1, len(subwords)):
                    if subwords[j] == '>':
                        break
                entity = ''
            i = j + 1  # resume right after '>'
            continue

        if entity != '':  # inside a tag pair, e.g. 圆领: emit B-/I- labels
            for j in range(i, len(subwords)):
                if subwords[j] == '<':
                    i = j
                    break
                new_subwords.append(subwords[j])
                if j == i:
                    tags.append('B-' + entity)
                else:
                    tags.append('I-' + entity)
            else:  # no closing tag (malformed input): stop instead of looping forever
                i = len(subwords)
            continue

        tags.append('O')  # plain text outside any tag
        new_subwords.append(subwords[i])
        i = i + 1

    return new_subwords, tags
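
# A worked example for parse_tag_words (illustrative; the token list assumes the
# BERT basic tokenizer splits '<', '/', '>' and each CJK character apart):
#
#   parse_tag_words(['<', '领', '型', '>', '圆', '领', '<', '/', '领', '型', '>', '裙'])
#   -> (['圆', '领', '裙'], ['B-领型', 'I-领型', 'O'])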




def bpe(part='train'):
    """ Tokenize one data split and write parallel .src/.tgt files in BIO format. """
    bpe_dir = 'bpe'
    # all_attr = [line.strip().split('\t')[0] for line in open('raw/vocab_bioattr.txt')]
    # BertTokenizer.SPECIAL_TOKENS_ATTRIBUTES += all_attr
    tokenizer = BertTokenizer('../vocab/vocab.jd.txt')  # the never_split argument has no effect on the tokenizer
    with open(os.path.join(bpe_dir, part + '.src'), 'w', encoding='utf-8') as f_src, \
            open(os.path.join(bpe_dir, part + '.tgt'), 'w', encoding='utf-8') as f_tgt, \
            open('raw/jdai.jave.fashion.' + part, 'r', encoding='utf-8') as f_in:
        for line in f_in:
            cid, sid, sent, tag_sent = line.strip().split('\t')
            # _tokenize bypasses tokenize()'s added-token handling, so tags like
            # <领型> are split into '<', '领', '型', '>' like ordinary punctuation
            subwords = tokenizer._tokenize(tag_sent)
            subwords, tags = parse_tag_words(subwords)
            f_src.write(' '.join(subwords) + '\n')
            f_tgt.write(' '.join(tags) + '\n')
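
# The two output files are parallel, one sentence per line, e.g. (illustrative
# content; actual lines depend on the vocabulary and the data):
#   bpe/train.src: 圆 领 连 衣 裙
#   bpe/train.tgt: B-领型 I-领型 O O O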



def find_all_entity():
    """ Count every attribute tag in the training data and write a tag vocabulary. """
    all_tag_sent = []
    with open('raw/jdai.jave.fashion.train', 'r', encoding='utf-8') as f_in:
        for line in f_in:
            cid, sid, sent, tag_sent = line.strip().split('\t')
            all_tag_sent.append(tag_sent)
    entity_list = PATTERN_BIO.findall(''.join(all_tag_sent))
    tag_counter = Counter(entity_list)
    with open('raw/vocab_bioattr.txt', 'w', encoding='utf-8') as f_out:
        f_out.write(''.join('{}\t{}\n'.format(k, cnt) for k, cnt in tag_counter.items()))
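
# vocab_bioattr.txt ends up with one tag per line, tab-separated from its count,
# with opening and closing tags counted separately (counts below are made up):
#   <风格>	12345
#   </风格>	12345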


if __name__ == "__main__":
    # find_all_entity()
    for part in ['train', 'valid', 'test']:
        bpe(part)