File size: 2,411 Bytes
82515cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/env python
# -*- coding:utf-8 _*-
"""
@author:quincy qiang
@license: Apache Licence
@file: chinese_t2s.py.py
@time: 2023/04/19
@contact: yanqiangmiffy@gamil.com
@software: PyCharm
@description: Convert a Traditional Chinese text file to Simplified Chinese using OpenCC.
"""
import sys
import os
import opencc
from optparse import OptionParser


class T2S(object):
    """Convert a Traditional Chinese text file to Simplified Chinese.

    Constructing an instance runs the whole pipeline immediately: the input
    file is read into ``t_corpus``, every line is converted with OpenCC's
    ``t2s`` configuration into ``s_corpus``, and the result is written to the
    output file.
    """

    def __init__(self, infile, outfile):
        """Read ``infile``, convert it, and write the result to ``outfile``.

        Args:
            infile: Path of the UTF-8 text file to convert.
            outfile: Path the converted text is written to.
        """
        self.infile = infile
        self.outfile = outfile
        self.cc = opencc.OpenCC('t2s')
        self.t_corpus = []  # source lines, newline and tab characters removed
        self.s_corpus = []  # converted lines, same order as t_corpus
        self.read(self.infile)
        self.t2s()
        self.write(self.s_corpus, self.outfile)

    def read(self, path):
        """Append every line of the UTF-8 file at ``path`` to ``t_corpus``.

        Newline and tab characters are removed from each line before it is
        stored.

        Args:
            path: Path of the file to read.

        Raises:
            SystemExit: If ``path`` is not an existing regular file.
        """
        print(path)
        if not os.path.isfile(path):
            print("path is not a file")
            # sys.exit instead of the site-injected exit() builtin, which is
            # not guaranteed to exist; a non-zero status signals the failure.
            sys.exit(1)
        with open(path, encoding="UTF-8") as f:
            for line in f:
                self.t_corpus.append(line.replace("\n", "").replace("\t", ""))
        print("read finished")

    def t2s(self):
        """Convert each line of ``t_corpus`` and append it to ``s_corpus``.

        A progress message is rewritten in place on stdout every 1000 lines
        and once more after the last line.
        """
        all_line = len(self.t_corpus)
        now_line = 0  # stays 0 for an empty corpus so the final report works
        for now_line, line in enumerate(self.t_corpus, start=1):
            if now_line % 1000 == 0:
                sys.stdout.write("\rhandling with the {} line, all {} lines.".format(now_line, all_line))
            self.s_corpus.append(self.cc.convert(line))
        sys.stdout.write("\rhandling with the {} line, all {} lines.".format(now_line, all_line))
        print("\nhandling finished")

    def write(self, list, path):
        """Write the given lines to ``path`` as UTF-8, one per line.

        Any existing file at ``path`` is removed first.

        Args:
            list: Iterable of strings to write. The name shadows the builtin
                but is kept so existing keyword callers continue to work.
            path: Destination file path.
        """
        print("writing now......")
        if os.path.exists(path):
            os.remove(path)
        # The context manager closes the handle even if a write fails.
        with open(path, encoding="UTF-8", mode="w") as out:
            out.writelines(line + "\n" for line in list)
        print("writing finished.")


if __name__ == "__main__":
    # Command-line entry point:
    #   python <script> --input <traditional file> --output <simplified file>
    print("Traditional Chinese to Simplified Chinese")

    parser = OptionParser()
    parser.add_option("--input", dest="input", default="", help="traditional file")
    parser.add_option("--output", dest="output", default="", help="simplified file")
    (options, args) = parser.parse_args()

    # Both options default to "", so fail fast with a usage message instead of
    # handing an empty path to the converter.
    if not options.input or not options.output:
        parser.error("both --input and --output are required")

    try:
        T2S(infile=options.input, outfile=options.output)
    except Exception as err:
        # Report the failure on stderr and exit non-zero so that shells and
        # pipelines can detect that the conversion did not complete.
        print(err, file=sys.stderr)
        sys.exit(1)
    print("All Finished.")