File size: 1,520 Bytes
06ec039
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#!/usr/bin/python
### by Chenchen Ding, @ NICT, 2019/04
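### for python 2.x: tested with python 2.6 and 2.7
### this copy is set up for python 3.x (chr, and the "python 3.x" lines below);
### to run it under python 2.x: chr -> unichr, and use the "python 2.x" lines instead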

### usage:
### 	myseg.py < MY-ORIGINAL > MY-UNIT 

import sys, os, codecs
### rebind the three standard streams so that everything read and written
### by this script is decoded / encoded as UTF-8; exactly one of the two
### groups below must be active
### for python 2.x: the standard streams are wrapped directly
#sys.stdin  = codecs.getreader('utf-8')(sys.stdin)
#sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
#sys.stderr = codecs.getwriter('utf-8')(sys.stderr)
### for python 3.x: the binary ".buffer" under each text stream is wrapped
sys.stdin  = codecs.getreader('utf-8')(sys.stdin.buffer)
sys.stdout = codecs.getwriter('utf-8')(sys.stdout.buffer)
sys.stderr = codecs.getwriter('utf-8')(sys.stderr.buffer)

### single code points of the Unicode Myanmar block used by seg ()
X = chr (0x103a)	### U+103A, asat
T = chr (0x1037)	### U+1037, dot below
STACK = chr (0x1039)	### U+1039, virama (stacking mark)
### the two possible orders of dot below and asat; seg () rewrites TX as XT
TX = T + X
XT = X + T

### characters that seg () never lets start a unit (except at the very
### beginning of a line): each one is attached to the character before it
DEP = set (chr (c) for c in range (0x102b, 0x1033))	### U+102B .. U+1032
DEP.update (chr (c) for c in range (0x1036, 0x1039))	### U+1036 .. U+1038
DEP.update (chr (c) for c in range (0x103b, 0x103f))	### U+103B .. U+103E
DEP.add (X)

def seg (m) :
	"""Split one line of text into units and return them joined by blanks.

	m is a text string (one input line).  U+200C is turned into a blank,
	the pair TX is rewritten as XT, the text is lower-cased and every
	whitespace character is dropped before the units are built, so the
	original spacing of the line has no influence on the result.
	Returns one string; an empty or all-blank line gives ''.
	"""
	### normalise the line and squeeze out all whitespace
	text = m.replace (chr (0x200c), ' ').replace (TX, XT)
	chars = ''.join (text.lower ().split ())

	### generate basic units: a character from DEP is appended to the unit
	### in front of it, any other character (and the very first character
	### of the line, whatever it is) opens a new unit
	units = []
	for ch in chars :
		if ch in DEP and units :
			units [-1] += ch
		else :
			units.append (ch)

	### attach asat units (len < 4 to avoid au): a unit shorter than four
	### characters that contains X is moved onto the slot in front of it.
	### NOTE(review): the slot in front is empty when that unit was itself
	### moved in the previous step, so of several such units in a row only
	### the first one is attached and the others stay separate -- confirm
	### that this is intended
	for pos in range (1, len (units)) :
		cur = units [pos]
		if len (cur) < 4 and X in cur :
			units [pos-1] += cur
			units [pos] = ''

	### glue stacked units: a STACK character standing as a unit of its
	### own is merged with the units on both sides of it
	joined = ' '.join (u for u in units if u)
	return joined.replace (' ' + STACK + ' ', STACK)

def main () :
	"""Filter: read lines from stdin, write seg (line) plus a newline to stdout.

	A closed output pipe (e.g. "myseg.py < in | head") and Ctrl-C end the
	run quietly.  Any other error -- for instance input that is not valid
	UTF-8 -- is reported on stderr and the process exits with status 1, so
	that truncated output is never mistaken for a complete run.
	"""
	try :
		for m in sys.stdin :
			sys.stdout.write (seg (m) + "\n")
	except (BrokenPipeError, KeyboardInterrupt) :
		### the reader went away or the user interrupted: nothing to report
		pass
	except Exception as err :
		sys.stderr.write ("myseg: %s: %s\n" % (type (err).__name__, err))
		sys.exit (1)

### run the filter only when executed as a script, not when imported
if __name__ == "__main__" :
	main ()