hoodiexxx's picture
Upload 7 files
2c1d053 verified
raw
history blame
4.25 kB
# -*- coding: utf_8 -*-
from tqdm import tqdm
import jieba
import random
# --- Input files -----------------------------------------------------------
trainFile = 'data/output.txt'  # alternative path: 'data/train.txt'
devFile = 'data/output2.txt'  # alternative path: 'data/dev.txt'
labelFile = 'data/label2.txt'  # alternative path: 'data/label.txt'
stopwordFile = 'data/stopword.txt'
wordLabelFile = 'wordLabel.txt'

# --- Output files ----------------------------------------------------------
trainDataVecFile = 'traindata_vec.txt'
devDataVecFile = 'devdata_vec.txt'

# Number of token slots written per sample, not counting the leading label.
maxLen = 20

# Four indices followed by sixteen zeros of padding (20 entries in total).
# NOTE(review): json2txt() binds its own local of the same name, so this
# module-level list is not read anywhere in this file - confirm whether
# another module imports it.
title_ind = [1, 2, 3, 4] + [0] * 16
def read_labelFile(file):
    """Load the label vocabulary and build both lookup directions.

    Every non-empty line of *file* is split on a single space: the first
    field is the label name, the second is parsed as its integer id.

    Args:
        file: Path of the UTF-8 encoded label file.

    Returns:
        A ``(label_w2n, label_n2w)`` tuple mapping name -> id and id -> name.

    Raises:
        IndexError: If a non-empty line has no second field.
        ValueError: If the second field is not an integer.
    """
    # The context manager closes the handle as soon as the text is read.
    with open(file, 'r', encoding='utf_8') as fh:
        rows = fh.read().split('\n')
    # Drop *every* empty line. ``list.remove('')`` raised ValueError when the
    # file had no trailing newline and only removed the first blank line, so
    # any further blank line crashed the parser below.
    rows = [row for row in rows if row]
    label_w2n = {}
    label_n2w = {}
    for row in tqdm(rows, desc='read label'):
        fields = row.split(' ')
        name_w = fields[0]
        name_n = int(fields[1])
        label_w2n[name_w] = name_n
        label_n2w[name_n] = name_w
    return label_w2n, label_n2w
def read_stopword(file):
    """Return the contents of the stopword file as a list of lines.

    The text is split on ``'\\n'`` only, so a trailing newline in the file
    yields a final empty-string entry, exactly as before.

    Args:
        file: Path of the UTF-8 encoded stopword file.

    Returns:
        A list of strings, one per line of the file.
    """
    # Use a context manager so the handle is closed instead of leaked.
    with open(file, 'r', encoding='utf_8') as fh:
        return fh.read().split('\n')
def get_worddict(file):
    """Load the token vocabulary and build both lookup directions.

    Every non-empty line of *file* is split on a single space: the first
    field is the token, the second is parsed as its integer index.

    Args:
        file: Path of the UTF-8 encoded vocabulary file.

    Returns:
        A ``(word2ind, ind2word)`` tuple mapping token -> index and
        index -> token. When two tokens share an index, the one appearing
        later in the file wins in ``ind2word``.

    Raises:
        IndexError: If a non-empty line has no second field.
        ValueError: If the second field is not an integer.
    """
    # Read inside a context manager so the handle is closed, not leaked.
    with open(file, 'r', encoding='utf_8') as fh:
        rows = [row for row in fh.read().split('\n') if row]
    word2ind = {}
    for row in tqdm(rows, desc="get_worddict"):
        fields = row.split(' ')
        word2ind[fields[0]] = int(fields[1])
    ind2word = {index: word for word, index in word2ind.items()}
    return word2ind, ind2word
def _encode_title(title, cla_ind, word2ind, stopwords):
    """Encode one title as ``[cla_ind, idx, ...]`` with exactly maxLen + 1 items.

    The title is processed character by character; characters found in
    *stopwords* are skipped. A character missing from *word2ind* raises
    KeyError.
    """
    row = [cla_ind]
    row.extend(word2ind[ch] for ch in title if ch not in stopwords)
    # One label slot plus maxLen token slots. Derived from maxLen instead of
    # the former hard-coded 21 so the two cannot drift apart.
    width = maxLen + 1
    if len(row) > width:
        row = row[:width]
    else:
        row.extend([0] * (width - len(row)))
    return row


def _vectorize_file(src_path, dst_path, desc, label_dict, word2ind,
                    stopwords, cla_dict):
    """Convert every ``text:label`` line of *src_path* into a row of *dst_path*.

    Lines are shuffled before being written. *cla_dict* is updated in place
    with the number of samples seen per label.
    """
    with open(src_path, 'r', encoding='utf_8') as src:
        # readlines() keeps the trailing '\n', so a truthiness filter never
        # removed blank lines and they crashed on fields[1]; strip first.
        rows = [row for row in src.readlines() if row.strip()]
    random.shuffle(rows)
    # Only digits and commas are written, so the encoding choice does not
    # change the bytes produced.
    with open(dst_path, 'w', encoding='utf_8') as dst:
        for row in tqdm(rows, desc=desc):
            fields = row.replace('\n', '').split(':')
            cla = fields[1]
            cla_dict[cla] = cla_dict.get(cla, 0) + 1
            encoded = _encode_title(fields[0], label_dict[cla], word2ind,
                                    stopwords)
            # Every value, including the last one, is followed by a comma.
            dst.write(''.join(str(n) + ',' for n in encoded))
            dst.write('\n')


def json2txt():
    """Turn the train and dev text files into fixed-width index rows.

    Reads the label, vocabulary and stopword files named by the module-level
    path constants, then writes one comma-terminated row per input line to
    ``trainDataVecFile`` and ``devDataVecFile``. Each row is the label id
    followed by ``maxLen`` token indices, zero-padded or truncated. Finally
    the per-label sample counts of both files combined are written to
    ``cla_length.txt``, sorted by label name in descending order and
    followed by a ``total:`` line.

    Raises:
        KeyError: If a label is missing from the label file or a character
            is missing from the vocabulary.
        IndexError: If a non-blank input line contains no ``:`` separator.
    """
    label_dict, label_n2w = read_labelFile(labelFile)
    word2ind, ind2word = get_worddict(wordLabelFile)
    # A set makes the per-character stopword test O(1) instead of a list scan.
    stopwords = set(read_stopword(stopwordFile))
    cla_dict = {}

    # train data to vec
    _vectorize_file(trainFile, trainDataVecFile, "traindata to vec",
                    label_dict, word2ind, stopwords, cla_dict)
    # dev data to vec
    _vectorize_file(devFile, devDataVecFile, "dev to vec",
                    label_dict, word2ind, stopwords, cla_dict)

    cla_list = sorted(cla_dict.items(), key=lambda item: item[0], reverse=True)
    total = 0
    with open('cla_length.txt', 'w', encoding='utf_8') as f:
        for cla, count in cla_list:
            f.write('{} {} {}\n'.format(cla, label_dict[cla], count))
            total += count
        f.write('total: ' + str(total))
# traindata_vec.txt
# devdata_vec.txt
def main():
    """Script entry point: run the text-to-vector conversion once."""
    json2txt()


# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()