File size: 637 Bytes
e50fe35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
import sys
import codecs

def clean_vocab(in_vocab_fname, out_vocab_fname):
    """Copy the well-formed lines of a vocabulary file and report the rest.

    Both files are opened as UTF-8 via ``codecs.open``. A line counts as
    well-formed when, after trimming carriage returns, newlines and spaces
    from both ends, splitting it on a single space gives exactly two fields.
    Well-formed lines are written to the output exactly as they were read.

    Every other line is left out of the output and dumped to stdout instead:
    first its zero-based index with the stripped text, then each character
    of the raw line next to the hex value of its code point.

    Args:
        in_vocab_fname: Path of the vocabulary file to read.
        out_vocab_fname: Path of the filtered file, opened in write mode.
    """
    with codecs.open(in_vocab_fname, "r", encoding="utf-8") as source:
        with codecs.open(out_vocab_fname, "w", encoding="utf-8") as sink:
            for index, raw_line in enumerate(source):
                parts = raw_line.strip("\r\n ").split(" ")

                if len(parts) == 2:
                    # Write the untouched line so its original ending is kept.
                    sink.write(raw_line)
                    continue

                # Malformed entry: show it, then spell out every character so
                # invisible or unexpected code points become visible.
                print("{}: {}".format(index, raw_line.strip()))
                for char in raw_line:
                    print("{}:{}".format(char, hex(ord(char))))


if __name__ == "__main__":
    # Command-line entry point: the first argument is the vocabulary file to
    # read, the second is the path of the filtered file to write.
    # Check the argument count up front so a missing path gives a usage
    # message (sys.exit with a string prints it to stderr and exits with
    # status 1) rather than an IndexError traceback. Extra arguments are
    # ignored, exactly as before.
    if len(sys.argv) < 3:
        sys.exit("usage: {} IN_VOCAB OUT_VOCAB".format(sys.argv[0]))
    clean_vocab(sys.argv[1], sys.argv[2])