File size: 2,972 Bytes
2c1d053
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# -*- coding: utf-8 -*-
'''
Tokenize the training data (character-level here; jieba word-level is the
commented alternative inside main) and drop words found in the stop list.
Produces the vocabulary file, one word per line in the format:
        word  word_index  word_frequency
Also produces a length file with the probability of each sentence length.
'''

import json
import jieba
from tqdm import tqdm

trainFile = 'data/output.txt'  # trainFile = 'data/train.txt'
devFile = 'data/output2.txt' #"data/dev.txt"
stopwordFile = 'data/stopword.txt'  # one stop word per line
wordLabelFile = 'wordLabel.txt'  # output: word index frequency
lengthFile = 'length.txt'  # output: length probability


def read_stopword(file):
    """Load the stop-word file and return its lines as a list of words.

    Args:
        file: path to a UTF-8 text file, one stop word per line.

    Returns:
        list of str: the lines with their trailing newlines removed
        (blank lines are kept as empty strings, matching the original).
    """
    # 'with' guarantees the handle is closed; the original leaked it.
    with open(file, 'r', encoding='utf_8') as f:
        return [line.replace('\n', '') for line in f]


def _accumulate(path, desc, stoplist, worddict, len_dic):
    """Count character frequencies and per-line lengths for one data file.

    Each line is expected to look like ``title:label``; only the part
    before the first ':' is tokenized (character by character — the
    jieba word-level cut is the commented alternative).

    Args:
        path: UTF-8 data file, one sample per line.
        desc: tqdm progress-bar label.
        stoplist: collection of tokens to ignore.
        worddict: dict mutated in place, token -> frequency.
        len_dic: dict mutated in place, line length -> occurrence count.

    Returns:
        int: number of (non-empty) lines processed.
    """
    with open(path, 'r', encoding='utf_8') as f:
        datas = list(filter(None, f.readlines()))
    for line in tqdm(datas, desc=desc):
        line = line.replace('\n', '').split(':')
        # line = line.replace('\n', '').split('\t')
        title_seg = [i for i in line[0]]
        # title_seg = jieba.cut(line[0], cut_all=False)
        length = 0
        for w in title_seg:
            if w in stoplist:
                continue
            length += 1
            worddict[w] = worddict.get(w, 0) + 1
        len_dic[length] = len_dic.get(length, 0) + 1
    return len(datas)


def main():
    """Build the vocabulary and length-distribution files.

    Reads trainFile and devFile, counts every non-stop-word character,
    then writes:
      - wordLabelFile: ``word index frequency`` per line, most frequent
        first, indices starting at 1 (0 is typically reserved for padding);
      - lengthFile: ``length probability`` per line, longest first.
    """
    worddict = {}
    len_dic = {}
    # set(): O(1) membership tests in the inner loop (was an O(n) list scan).
    stoplist = set(read_stopword(stopwordFile))

    # The original counted data_num before filter() for train but after for
    # dev; unified here (identical in practice: readlines yields no '' lines).
    data_num = 0
    data_num += _accumulate(trainFile, 'traindata word to label',
                            stoplist, worddict, len_dic)
    data_num += _accumulate(devFile, 'devdata word to label',
                            stoplist, worddict, len_dic)

    # Vocabulary, most frequent first; indices start at 1.
    wordlist = sorted(worddict.items(), key=lambda item: item[1], reverse=True)
    with open(wordLabelFile, 'w', encoding='utf_8') as f:
        for ind, (word, freq) in enumerate(wordlist, start=1):
            f.write(word + ' ' + str(ind) + ' ' + str(freq) + '\n')

    # Turn raw length counts into probabilities (3 decimal places).
    # Guard: the original raised ZeroDivisionError on two empty input files.
    if data_num:
        for k, v in len_dic.items():
            len_dic[k] = round(v * 1.0 / data_num, 3)
    len_list = sorted(len_dic.items(), key=lambda item: item[0], reverse=True)
    with open(lengthFile, 'w', encoding='utf_8') as f:
        for length, prob in len_list:
            f.write(str(length) + ' ' + str(prob) + '\n')


# lengthFile = 'length.txt'
# wordLabelFile = 'wordLabel.txt'
# Script entry point: build the word-frequency and length-distribution files.
if __name__ == "__main__":
    main()