# -*- coding: utf_8 -*-
"""Convert labelled text lines into fixed-length rows of integer indices.

Each input line has the form ``title:label``.  The title is split into
single characters, stop words are dropped, the remaining characters are
mapped to integer indices and the row is truncated / zero-padded so that it
holds exactly ``maxLen + 1`` values (the label index followed by ``maxLen``
character indices).  Rows are written as comma-separated integers, one per
line, each value followed by a comma.
"""
import random

import jieba  # noqa: F401  (only referenced by the commented-out word segmentation)
from tqdm import tqdm

trainFile = 'data/output.txt'  # alternative: 'data/train.txt'
trainDataVecFile = 'traindata_vec.txt'

devFile = 'data/output2.txt'  # alternative: 'data/dev.txt'
devDataVecFile = 'devdata_vec.txt'

labelFile = 'data/label2.txt'  # alternative: 'data/label.txt'
stopwordFile = 'data/stopword.txt'
wordLabelFile = 'wordLabel.txt'

# Number of character indices kept per row (the label index is stored on top
# of these, so every written row has maxLen + 1 values).
maxLen = 20

# Sample list of 20 indices (four values followed by zero padding).  It is
# not read by any function in this file; kept for backward compatibility.
title_ind = [1, 2, 3, 4]
title_ind.extend([0] * 16)


def _read_nonempty_lines(path):
    """Return the non-empty lines of the UTF-8 text file at *path*."""
    with open(path, 'r', encoding='utf_8') as handle:
        return [line for line in handle.read().split('\n') if line]


def read_labelFile(file):
    """Load the label table.

    Each non-empty line of *file* is ``<name> <id>`` separated by a single
    space, where ``<id>`` parses as an integer.

    Returns:
        A pair ``(label_w2n, label_n2w)``: name -> id and id -> name.

    Raises:
        IndexError: a line has no space-separated second field.
        ValueError: the second field is not an integer.
    """
    label_w2n = {}
    label_n2w = {}
    # Empty lines are skipped wherever they occur, so a missing or repeated
    # trailing newline no longer breaks loading.
    for line in tqdm(_read_nonempty_lines(file), desc='read label'):
        fields = line.split(' ')
        name_w = fields[0]
        name_n = int(fields[1])
        label_w2n[name_w] = name_n
        label_n2w[name_n] = name_w
    return label_w2n, label_n2w


def read_stopword(file):
    """Return every line of *file* as a list (empty lines included)."""
    with open(file, 'r', encoding='utf_8') as handle:
        return handle.read().split('\n')


def get_worddict(file):
    """Load the vocabulary.

    Each non-empty line of *file* is ``<word> <index>`` separated by a single
    space, where ``<index>`` parses as an integer.

    Returns:
        A pair ``(word2ind, ind2word)``: word -> index and index -> word.
    """
    word2ind = {}
    for line in tqdm(_read_nonempty_lines(file), desc="get_worddict"):
        fields = line.split(' ')
        word2ind[fields[0]] = int(fields[1])
    ind2word = {index: word for word, index in word2ind.items()}
    return word2ind, ind2word


def _encode_row(title, label_id, word2ind, stopwords):
    """Build one output row: label id, then character indices, fixed width.

    Characters found in *stopwords* are skipped.  The row is cut or
    zero-padded to exactly ``maxLen + 1`` entries.

    Raises:
        KeyError: a kept character is missing from *word2ind*.
    """
    row = [label_id]
    row.extend(word2ind[ch] for ch in title if ch not in stopwords)
    width = maxLen + 1
    del row[width:]
    row.extend([0] * (width - len(row)))
    return row


def _convert_file(src_path, dst_path, desc, label_dict, word2ind, stopwords,
                  cla_dict):
    """Shuffle the lines of *src_path*, encode them and write to *dst_path*.

    *cla_dict* is updated in place with the number of lines seen per label
    name.
    """
    with open(src_path, 'r', encoding='utf_8') as src:
        lines = [raw.replace('\n', '') for raw in src]
    # Blank lines carry no "title:label" pair; drop them before shuffling.
    lines = [line for line in lines if line.strip()]
    random.shuffle(lines)

    # Only ASCII digits and commas are written, so the explicit encoding just
    # makes the output independent of the platform default.
    with open(dst_path, 'w', encoding='utf_8') as dst:
        for line in tqdm(lines, desc=desc):
            fields = line.split(':')
            title = fields[0]
            cla = fields[1]
            cla_dict[cla] = cla_dict.get(cla, 0) + 1
            row = _encode_row(title, label_dict[cla], word2ind, stopwords)
            # Every value, including the last, is followed by a comma.
            dst.write(','.join(map(str, row)) + ',\n')


def json2txt():
    """Encode the train and dev files and write a per-label count summary.

    Reads ``labelFile``, ``wordLabelFile``, ``stopwordFile``, ``trainFile``
    and ``devFile``; writes ``trainDataVecFile``, ``devDataVecFile`` and
    ``cla_length.txt``.  The summary lists ``<label> <label id> <count>`` per
    line, sorted by label name in descending order, followed by a final
    ``total: <n>`` entry covering both input files.
    """
    label_dict, _ = read_labelFile(labelFile)
    word2ind, _ = get_worddict(wordLabelFile)
    # A set gives constant-time membership tests for every character.
    stopwords = set(read_stopword(stopwordFile))

    cla_dict = {}
    _convert_file(trainFile, trainDataVecFile, "traindata to vec",
                  label_dict, word2ind, stopwords, cla_dict)
    _convert_file(devFile, devDataVecFile, "dev to vec",
                  label_dict, word2ind, stopwords, cla_dict)

    cla_list = sorted(cla_dict.items(), key=lambda item: item[0], reverse=True)
    total = 0
    with open('cla_length.txt', 'w', encoding='utf_8') as summary:
        for cla, count in cla_list:
            summary.write(
                str(cla) + ' ' + str(label_dict[cla]) + ' ' + str(count) + '\n')
            total += count
        summary.write('total: ' + str(total))


def main():
    """Script entry point: run the conversion with the module-level paths."""
    json2txt()


if __name__ == "__main__":
    main()