|
|
|
from tqdm import tqdm |
|
import jieba |
|
import random |
|
|
|
# --- Input corpora and the vectorised files generated from them -------------
# json2txt() reads trainFile/devFile and writes the matching *DataVecFile.
trainFile = 'data/output.txt'
trainDataVecFile = 'traindata_vec.txt'

devFile = 'data/output2.txt'
devDataVecFile = 'devdata_vec.txt'

# --- Lookup tables -----------------------------------------------------------
labelFile = 'data/label2.txt'        # parsed by read_labelFile()
stopwordFile = 'data/stopword.txt'   # parsed by read_stopword()
wordLabelFile = 'wordLabel.txt'      # parsed by get_worddict()

# Number of token ids kept per sample (the label id is stored in addition).
maxLen = 20

# Module-level sample vector: four ids followed by sixteen zeros.
# NOTE(review): json2txt() uses a local variable of the same name, so this
# value is not read by the functions visible in this file.
title_ind = [1, 2, 3, 4] + [0] * 16
|
|
|
|
|
|
|
def read_labelFile(file):
    """Load the label table from a text file of ``name id`` lines.

    Every non-empty line is split on a single space; the first field is the
    label name and the second is parsed as its integer id.

    Args:
        file: Path of the UTF-8 encoded label file.

    Returns:
        A tuple ``(label_w2n, label_n2w)``: a dict mapping label name to id
        and a dict mapping id back to label name.

    Raises:
        IndexError: If a non-empty line has no second space-separated field.
        ValueError: If the second field is not an integer.
    """
    # Close the handle deterministically instead of leaking it.
    with open(file, 'r', encoding='utf_8') as handle:
        # Drop *every* empty line. The previous ``data.remove('')`` raised
        # ValueError when the file had no trailing newline and left any
        # additional blank lines in place.
        rows = [row for row in handle.read().split('\n') if row]

    label_w2n = {}
    label_n2w = {}
    for row in tqdm(rows, desc='read label'):
        fields = row.split(' ')
        name_w = fields[0]
        name_n = int(fields[1])
        label_w2n[name_w] = name_n
        label_n2w[name_n] = name_w

    return label_w2n, label_n2w
|
|
|
|
|
def read_stopword(file):
    """Read the stop-word file and return its lines as a list.

    The content is split on ``'\\n'`` without further filtering, so a file
    ending with a newline yields a trailing empty string in the result.

    Args:
        file: Path of the UTF-8 encoded stop-word file.

    Returns:
        A list of strings, one per line of the file.
    """
    # Use a context manager so the file handle is closed promptly.
    with open(file, 'r', encoding='utf_8') as handle:
        return handle.read().split('\n')
|
|
|
|
|
def get_worddict(file):
    """Load the vocabulary from a text file of ``word index`` lines.

    Empty lines are ignored. Each remaining line is split on a single space;
    the first field is the word and the second is parsed as its integer index.

    Args:
        file: Path of the UTF-8 encoded vocabulary file.

    Returns:
        A tuple ``(word2ind, ind2word)``: word -> index and index -> word.

    Raises:
        IndexError: If a non-empty line has no second space-separated field.
        ValueError: If the second field is not an integer.
    """
    # Close the handle deterministically instead of leaking it.
    with open(file, 'r', encoding='utf_8') as handle:
        rows = [row for row in handle.read().split('\n') if row]

    word2ind = {}
    for row in tqdm(rows, desc="get_worddict"):
        fields = row.split(' ')
        word2ind[fields[0]] = int(fields[1])

    # Reverse mapping; if two words share an index the later one wins.
    ind2word = {index: word for word, index in word2ind.items()}
    return word2ind, ind2word
|
|
|
|
|
def _encode_sample(cla_ind, text, word2ind, stopwords):
    """Return ``[label id, char ids...]`` cut or zero-padded to ``maxLen + 1`` items."""
    row = [cla_ind]
    # The text is tokenised per character; stop-word characters are skipped.
    # An out-of-vocabulary character raises KeyError, as it did before.
    row.extend(word2ind[ch] for ch in text if ch not in stopwords)
    # One slot for the label plus maxLen token slots (previously a literal 21).
    width = maxLen + 1
    row = row[:width]
    row.extend([0] * (width - len(row)))
    return row


def _convert_corpus(src_path, dst_path, desc, label_dict, word2ind, stopwords, cla_dict):
    """Vectorise each ``text:label`` line of src_path into dst_path and tally labels in cla_dict."""
    with open(src_path, 'r', encoding='utf_8') as src:
        rows = [raw.replace('\n', '') for raw in src]
    # Strip the newline *before* filtering: lines from the file still carry
    # '\n', so the old ``filter(None, ...)`` never removed blank lines and a
    # blank line crashed on ``line[1]``.
    rows = [row for row in rows if row]
    random.shuffle(rows)

    with open(dst_path, 'w') as dst:
        for row in tqdm(rows, desc=desc):
            fields = row.split(':')
            cla = fields[1]
            cla_dict[cla] = cla_dict.get(cla, 0) + 1
            encoded = _encode_sample(label_dict[cla], fields[0], word2ind, stopwords)
            # Every value is followed by a comma (including the last one) to
            # keep the output format unchanged.
            dst.write(''.join(str(n) + ',' for n in encoded) + '\n')


def json2txt():
    """Convert the train and dev corpora into fixed-width id vectors.

    Reads ``labelFile``, ``wordLabelFile`` and ``stopwordFile``, then turns
    every ``text:label`` line of ``trainFile`` and ``devFile`` into a line of
    ``maxLen + 1`` comma-terminated integers (label id first, then one id per
    non-stop-word character, zero-padded) written to ``trainDataVecFile`` and
    ``devDataVecFile``. Lines are shuffled before being written. Finally the
    per-label sample counts over both corpora are written to
    ``cla_length.txt``, sorted by label name in descending order and followed
    by a ``total:`` line.

    Raises:
        IndexError: If a non-empty corpus line contains no ``:`` separator.
        KeyError: If a label is missing from the label table or a character
            is missing from the vocabulary.
    """
    label_dict, _ = read_labelFile(labelFile)
    word2ind, _ = get_worddict(wordLabelFile)
    # Set membership is O(1) per character; the list scan was O(len(stoplist)).
    stopwords = frozenset(read_stopword(stopwordFile))
    cla_dict = {}

    _convert_corpus(trainFile, trainDataVecFile, "traindata to vec",
                    label_dict, word2ind, stopwords, cla_dict)
    _convert_corpus(devFile, devDataVecFile, "dev to vec",
                    label_dict, word2ind, stopwords, cla_dict)

    cla_list = sorted(cla_dict.items(), key=lambda item: item[0], reverse=True)
    total = 0
    with open('cla_length.txt', 'w', encoding='utf_8') as summary:
        for cla, count in cla_list:
            summary.write('{} {} {}\n'.format(cla, label_dict[cla], count))
            total += count
        summary.write('total: ' + str(total))
|
|
|
|
|
|
|
|
|
def main():
    """Script entry point: run the corpus-to-vector conversion."""
    json2txt()


# Only run the conversion when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|