|
|
|
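'''
Tokenize the training data and drop every token that appears in the stop list.

NOTE: jieba is imported for word segmentation, but the text before the first
':' on each line is currently split into single characters instead.

The result is a vocabulary file.

Each line of the vocabulary file has the form: token index frequency
'''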
|
|
|
import json |
|
import jieba |
|
from tqdm import tqdm |
|
|
|
# Input: training samples, one per line (read by main()).
trainFile = 'data/output.txt'

# Input: dev samples, one per line (read by main()).
devFile = 'data/output2.txt'

# Input: stop-word list, one entry per line (loaded via read_stopword()).
stopwordFile = 'data/stopword.txt'

# Output: vocabulary file, one "token index frequency" line per token.
wordLabelFile = 'wordLabel.txt'

# Output: length distribution file, one "length ratio" line per length.
lengthFile = 'length.txt'
|
|
|
|
|
def read_stopword(file):
    """Load the stop-word list from a UTF-8 text file.

    Args:
        file: Path of the stop-word file, one entry per line.

    Returns:
        A list with one string per line of the file, in file order, with
        newline characters removed. An empty file yields an empty list.
    """
    # The context manager closes the handle even if decoding fails part-way.
    with open(file, 'r', encoding='utf_8') as handle:
        return [line.replace('\n', '') for line in handle]
|
|
|
|
|
def _tally_file(path, stopwords, token_counts, length_counts, desc):
    """Tally tokens and per-line lengths for one data file.

    Each line is cut at its first ':'; the text before it is split into
    single characters. Characters present in ``stopwords`` are skipped, the
    remaining ones are counted in ``token_counts``, and the number of kept
    characters on the line is counted in ``length_counts``.

    Args:
        path: UTF-8 text file to read, one sample per line.
        stopwords: Set of tokens to ignore.
        token_counts: Counter of token -> frequency, updated in place.
        length_counts: Counter of kept length -> number of lines, updated
            in place.
        desc: Label shown on the tqdm progress bar.

    Returns:
        The number of lines read from ``path``.
    """
    with open(path, 'r', encoding='utf_8') as src:
        # readlines() never yields empty strings, so no filtering is needed;
        # a blank line simply counts as a sample of length 0.
        lines = src.readlines()

    for line in tqdm(lines, desc=desc):
        title = line.rstrip('\n').partition(':')[0]
        kept = [ch for ch in title if ch not in stopwords]
        # Counter keeps first-seen order, which decides ties in the ranking.
        token_counts.update(kept)
        length_counts[len(kept)] += 1

    return len(lines)


def main():
    """Build the vocabulary file and the length-distribution file.

    Reads ``trainFile`` and ``devFile``, counts every non-stop-word character
    that appears before the first ':' of each line, and writes:

    * ``wordLabelFile``: one ``token index frequency`` line per token, ordered
      by descending frequency, with indices starting at 1.
    * ``lengthFile``: one ``length ratio`` line per distinct kept length,
      ordered by descending length, where ratio is the share of all lines
      (train + dev) having that length, rounded to 3 decimals.
    """
    from collections import Counter

    # A set gives O(1) membership tests inside the per-character loop.
    stopwords = set(read_stopword(stopwordFile))
    token_counts = Counter()
    length_counts = Counter()

    data_num = _tally_file(trainFile, stopwords, token_counts, length_counts,
                           desc='traindata word to label')
    data_num += _tally_file(devFile, stopwords, token_counts, length_counts,
                            desc='devdata word to label')

    # Stable sort: tokens with equal frequency stay in first-seen order.
    ranked = sorted(token_counts.items(), key=lambda item: item[1],
                    reverse=True)
    with open(wordLabelFile, 'w', encoding='utf_8') as out:
        out.writelines(
            '{} {} {}\n'.format(token, index, freq)
            for index, (token, freq) in enumerate(ranked, start=1)
        )

    # Build a fresh sequence instead of rewriting the dict while iterating it.
    ratios = sorted(
        ((length, round(count / data_num, 3))
         for length, count in length_counts.items()),
        key=lambda item: item[0],
        reverse=True,
    )
    with open(lengthFile, 'w', encoding='utf_8') as out:
        out.writelines(
            '{} {}\n'.format(length, ratio) for length, ratio in ratios
        )
|
|
|
|
|
|
|
|
|
# Run the vocabulary build only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
|