NgalNgal committed on
Commit
06ec039
1 Parent(s): 7416a36

Upload myseg.py

Browse files
Files changed (1) hide show
  1. myseg.py +50 -0
myseg.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python
2
+ ### by Chenchen Ding, @ NICT, 2019/04
3
+ ### for python 2.x: tested with python 2.6 and 2.7
4
+ ### for python 3.x: unichr -> chr, using the "python 3.x" lines
5
+
6
+ ### usage:
7
+ ### myseg.py < MY-ORIGINAL > MY-UNIT
8
+
9
+ import sys, os, codecs
10
### Re-wrap the three standard streams so that all text I/O is UTF-8.
### for python 2.x, use these three lines instead of the python 3.x ones:
#sys.stdin = codecs.getreader('utf-8')(sys.stdin)
#sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
#sys.stderr = codecs.getwriter('utf-8')(sys.stderr)
### for python 3.x: wrap the underlying binary buffers of the streams
sys.stdin = codecs.getreader('utf-8')(sys.stdin.buffer)
sys.stdout, sys.stderr = (
    codecs.getwriter('utf-8')(stream.buffer)
    for stream in (sys.stdout, sys.stderr)
)
18
+
19
### Code points with a special role in seg ().
X = chr(0x103a)      # U+103A, the "asat" sign handled by seg's asat step
T = chr(0x1037)      # U+1037
STACK = chr(0x1039)  # U+1039, glued to its neighbours by seg's stacking step
TX = T + X           # U+1037 followed by U+103A
XT = X + T           # U+103A followed by U+1037

### Characters that seg () attaches to the character preceding them:
### U+102B..U+1032, U+1036..U+1038, U+103B..U+103E, plus X itself.
DEP = {chr(code) for code in range(0x102b, 0x102b + 8)}
DEP.update(chr(code) for code in range(0x1036, 0x1036 + 3))
DEP.update(chr(code) for code in range(0x103b, 0x103b + 4))
DEP.add(X)
26
+
27
def seg(m):
    """Split one line of text into space-separated units.

    The line is first normalised: U+200C is replaced by a space, every
    TX sequence is rewritten as XT, the text is lower-cased and all
    whitespace is removed.  The remaining characters are then grouped in
    three passes (see the inline comments).

    m       -- the input line (a text string)
    returns -- a single string with the units joined by one space each
               (the empty string when the line holds no characters)
    """
    text = m.replace(chr(0x200c), ' ').replace(TX, XT)
    units = list(''.join(text.lower().strip().split()))

    ### generate basic units: walking from the end of the line towards
    ### the start, fold every piece that begins with a DEP character
    ### into the piece just before it
    for pos in range(len(units) - 1, 0, -1):
        piece = units[pos]
        if piece[0] not in DEP:
            continue
        units[pos - 1] += piece
        units[pos] = ''
    units = [piece for piece in units if piece]

    ### attach asat units: a unit that contains X and is shorter than
    ### four characters joins the slot before it (the original author's
    ### note on the length limit: "len < 4 to avoid au")
    for pos in range(1, len(units)):
        piece = units[pos]
        if len(piece) >= 4 or X not in piece:
            continue
        units[pos - 1] += piece
        units[pos] = ''
    units = [piece for piece in units if piece]

    ### glue stacked units: a STACK standing alone between two units
    ### joins them into one
    spaced = ' '.join(units)
    return spaced.replace(' ' + STACK + ' ', STACK)
44
+
45
def main():
    """Segment every line read from stdin and write the result to stdout.

    Each input line is passed to seg () and the returned string is
    written followed by a newline.

    A closed output pipe (for example ``myseg.py < in | head``) or a
    keyboard interrupt ends the run quietly.  Any other error, such as
    input that is not valid UTF-8, also stops processing but is reported
    on stderr instead of being discarded, so truncated output can be
    noticed.  The function never raises.
    """
    try:
        for line in sys.stdin:
            sys.stdout.write(seg(line) + "\n")
    except (IOError, KeyboardInterrupt):
        # Reader went away or the user pressed Ctrl-C: stop quietly.
        pass
    except Exception as err:
        # Previously swallowed by a bare "except: pass"; keep the
        # best-effort behaviour (no traceback) but say what happened.
        sys.stderr.write("myseg: %s: %s\n" % (type(err).__name__, err))


if __name__ == "__main__":
    main()