Upload myseg.py
Browse files
myseg.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python
|
2 |
+
### by Chenchen Ding, @ NICT, 2019/04
|
3 |
+
### for python 2.x: tested with python 2.6 and 2.7 (use unichr instead of chr and enable the "python 2.x" lines below)
|
4 |
+
### for python 3.x (the variant in this file): unichr is replaced by chr and the "python 3.x" lines below are active
|
5 |
+
|
6 |
+
### usage:
|
7 |
+
### myseg.py < MY-ORIGINAL > MY-UNIT
|
8 |
+
|
9 |
+
import sys, os, codecs
|
10 |
+
### Wrap the three standard streams in UTF-8 codecs so that input is decoded
### and output is encoded as UTF-8 by this script itself.
### for python 2.x (enable these three lines and disable the 3.x ones below)
#sys.stdin = codecs.getreader('utf-8')(sys.stdin)
#sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
#sys.stderr = codecs.getwriter('utf-8')(sys.stderr)
### for python 3.x: the codecs are layered over the underlying byte buffers
sys.stdin = codecs.getreader('utf-8')(sys.stdin.buffer)
sys.stdout = codecs.getwriter('utf-8')(sys.stdout.buffer)
sys.stderr = codecs.getwriter('utf-8')(sys.stderr.buffer)
|
18 |
+
|
19 |
+
### Code points with a special role in the segmentation below.
X = chr(0x103A)      ### asat
T = chr(0x1037)      ### dot below
STACK = chr(0x1039)  ### stacking sign (virama)
### Both orders of dot-below/asat occur in text; seg() rewrites TX as XT.
TX, XT = T + X, X + T

### Dependent characters: each one is attached to the character before it.
DEP = {chr(0x102B + x) for x in range(8)}    ### U+102B .. U+1032
DEP |= {chr(0x1036 + x) for x in range(3)}   ### U+1036 .. U+1038
DEP |= {chr(0x103B + x) for x in range(4)}   ### U+103B .. U+103E
DEP.add(X)


def seg(m):
    """Split one line of Myanmar text into space-separated units.

    The line is first normalised: zero-width non-joiners (U+200C) and all
    whitespace are removed, every dot-below + asat pair is rewritten as
    asat + dot-below, and the text is lower-cased.  Units are then built in
    three passes (basic units, asat attachment, stack gluing).

    m: the input line (str).
    Returns: a str with the units separated by single spaces; an empty or
    whitespace-only line yields ''.
    """
    m = m.replace(chr(0x200C), ' ')
    m = m.replace(TX, XT)
    chars = ''.join(m.lower().strip().split())

    ### generate basic units: a dependent character joins the unit before it;
    ### a dependent character at the very start has nothing to join and
    ### therefore opens a unit of its own
    units = []
    for ch in chars:
        if units and ch in DEP:
            units[-1] += ch
        else:
            units.append(ch)

    ### attach asat units (len < 4 to avoid au): a short unit containing the
    ### asat is appended to the slot on its left and its own slot is blanked.
    ### The index loop is kept on purpose: when the slot on the left was
    ### blanked by the previous step, the unit simply moves into it, so
    ### consecutive asat units are not chained onto a single base.
    for i in range(1, len(units)):
        if X in units[i] and len(units[i]) < 4:
            units[i - 1] += units[i]
            units[i] = ''
    units = [u for u in units if u]

    ### glue stacked units: a stacking sign standing alone between two units
    ### joins them into one
    return ' '.join(units).replace(' ' + STACK + ' ', STACK)
|
44 |
+
|
45 |
+
def main():
    """Segment every line from stdin and write the result to stdout.

    Each input line is passed to seg() and the returned string is written
    followed by a newline.  A closed output pipe or a keyboard interrupt
    ends the run quietly; any other failure is reported on stderr instead
    of being discarded silently, and the function then returns normally.
    """
    try:
        for line in sys.stdin:
            sys.stdout.write(seg(line) + "\n")
    except (BrokenPipeError, KeyboardInterrupt):
        ### the reader of our output went away, or the user stopped the run
        pass
    except Exception as err:
        ### keep the original best-effort behaviour (no traceback, no crash)
        ### but make the failure visible
        sys.stderr.write("myseg: %s: %s\n" % (type(err).__name__, err))


if __name__ == "__main__":
    main()
|