Spaces:
				
			
			
	
			
			
		Sleeping
		
	
	
	
			
			
	
	
	
	
		
		
		Sleeping
		
	requirement
Browse files- LangSegment/LangSegment.py +1068 -0
 - LangSegment/__init__.py +9 -0
 - LangSegment/utils/__init__.py +0 -0
 - LangSegment/utils/num.py +327 -0
 - requirements.txt +1 -1
 
    	
        LangSegment/LangSegment.py
    ADDED
    
    | 
         @@ -0,0 +1,1068 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            """
         
     | 
| 2 | 
         
            +
            This file bundles language identification functions.
         
     | 
| 3 | 
         
            +
             
     | 
| 4 | 
         
            +
            Modifications (fork): Copyright (c) 2021, Adrien Barbaresi.
         
     | 
| 5 | 
         
            +
             
     | 
| 6 | 
         
            +
            Original code: Copyright (c) 2011 Marco Lui <saffsd@gmail.com>.
         
     | 
| 7 | 
         
            +
            Based on research by Marco Lui and Tim Baldwin.
         
     | 
| 8 | 
         
            +
             
     | 
| 9 | 
         
            +
            See LICENSE file for more info.
         
     | 
| 10 | 
         
            +
            https://github.com/adbar/py3langid
         
     | 
| 11 | 
         
            +
             
     | 
| 12 | 
         
            +
            Projects:
         
     | 
| 13 | 
         
            +
            https://github.com/juntaosun/LangSegment
         
     | 
| 14 | 
         
            +
            """
         
     | 
| 15 | 
         
            +
             
     | 
| 16 | 
         
            +
            import os
         
     | 
| 17 | 
         
            +
            import re
         
     | 
| 18 | 
         
            +
            import sys
         
     | 
| 19 | 
         
            +
            import numpy as np
         
     | 
| 20 | 
         
            +
            from collections import Counter
         
     | 
| 21 | 
         
            +
            from collections import defaultdict
         
     | 
| 22 | 
         
            +
             
     | 
| 23 | 
         
            +
            # import langid
         
     | 
| 24 | 
         
            +
            # import py3langid as langid
         
     | 
| 25 | 
         
            +
            # pip install py3langid==0.2.2
         
     | 
| 26 | 
         
            +
             
     | 
| 27 | 
         
            +
            # 启用语言预测概率归一化,概率预测的分数。因此,实现重新规范化 产生 0-1 范围内的输出。
         
     | 
| 28 | 
         
            +
            # langid disables probability normalization by default. For command-line usages of langid.py, it can be enabled by passing the -n flag.
         
     | 
| 29 | 
         
            +
            # For probability normalization in library use, the user must instantiate their own LanguageIdentifier. An example of such usage is as follows:
         
     | 
| 30 | 
         
            +
            from py3langid.langid import LanguageIdentifier, MODEL_FILE
         
     | 
| 31 | 
         
            +
            langid = LanguageIdentifier.from_pickled_model(MODEL_FILE, norm_probs=True)
         
     | 
| 32 | 
         
            +
             
     | 
| 33 | 
         
            +
            # Digital processing
         
     | 
| 34 | 
         
            +
            try:from LangSegment.utils.num import num2str
         
     | 
| 35 | 
         
            +
            except ImportError:
         
     | 
| 36 | 
         
            +
                try:from utils.num import num2str
         
     | 
| 37 | 
         
            +
                except ImportError as e:
         
     | 
| 38 | 
         
            +
                    raise e
         
     | 
| 39 | 
         
            +
             
     | 
| 40 | 
         
            +
            # -----------------------------------
         
     | 
| 41 | 
         
            +
            # 更新日志:新版本分词更加精准。
         
     | 
| 42 | 
         
            +
            # Changelog: The new version of the word segmentation is more accurate.
         
     | 
| 43 | 
         
            +
            # チェンジログ:新しいバージョンの単語セグメンテーションはより正確です。
         
     | 
| 44 | 
         
            +
            # Changelog: 분할이라는 단어의 새로운 버전이 더 정확합니다.
         
     | 
| 45 | 
         
            +
            # -----------------------------------
         
     | 
| 46 | 
         
            +
             
     | 
| 47 | 
         
            +
             
     | 
| 48 | 
         
            +
            # Word segmentation function: 
         
     | 
| 49 | 
         
            +
            # automatically identify and split the words (Chinese/English/Japanese/Korean) in the article or sentence according to different languages, 
         
     | 
| 50 | 
         
            +
            # making it more suitable for TTS processing.
         
     | 
| 51 | 
         
            +
            # This code is designed for front-end text multi-lingual mixed annotation distinction, multi-language mixed training and inference of various TTS projects.
         
     | 
| 52 | 
         
            +
            # This processing result is mainly for (Chinese = zh, Japanese = ja, English = en, Korean = ko), and can actually support up to 97 different language mixing processing.
         
     | 
| 53 | 
         
            +
             
     | 
| 54 | 
         
            +
            #===========================================================================================================
         
     | 
| 55 | 
         
            +
            #分かち書き機能:文章や文章の中の例えば(中国語/英語/日本語/韓国語)を、異なる言語で自動的に認識して分割し、TTS処理により適したものにします。
         
     | 
| 56 | 
         
            +
            #このコードは、さまざまなTTSプロジェクトのフロントエンドテキストの多言語混合注釈区別、多言語混合トレーニング、および推論のために特別に作成されています。
         
     | 
| 57 | 
         
            +
            #===========================================================================================================
         
     | 
| 58 | 
         
            +
            #(1)自動分詞:「韓国語では何を読むのですかあなたの体育の先生は誰ですか?今回の発表会では、iPhone 15シリーズの4機種が登場しました」
         
     | 
| 59 | 
         
            +
            #(2)手動分詞:“あなたの名前は<ja>佐々木ですか?<ja>ですか?”
         
     | 
| 60 | 
         
            +
            #この処理結果は主に(中国語=zh、日本語=ja、英語=en、韓国語=ko)を対象としており、実際には最大97の異なる言語の混合処理をサポートできます。
         
     | 
| 61 | 
         
            +
            #===========================================================================================================
         
     | 
| 62 | 
         
            +
             
     | 
| 63 | 
         
            +
            #===========================================================================================================
         
     | 
| 64 | 
         
            +
            # 단어 분할 기능: 기사 또는 문장에서 단어(중국어/영어/일본어/한국어)를 다른 언어에 따라 자동으로 식별하고 분할하여 TTS 처리에 더 적합합니다.
         
     | 
| 65 | 
         
            +
            # 이 코드는 프런트 엔드 텍스트 다국어 혼합 주석 분화, 다국어 혼합 교육 및 다양한 TTS 프로젝트의 추론을 위해 설계되었습니다.
         
     | 
| 66 | 
         
            +
            #===========================================================================================================
         
     | 
| 67 | 
         
            +
            # (1) 자동 단어 분할: "한국어로 무엇을 읽습니까? 스포츠 씨? 이 컨퍼런스는 4개의 iPhone 15 시리즈 모델을 제공합니다."
         
     | 
| 68 | 
         
            +
            # (2) 수동 참여: "이름이 <ja>Saki입니까? <ja>?"
         
     | 
| 69 | 
         
            +
            # 이 처리 결과는 주로 (중국어 = zh, 일본어 = ja, 영어 = en, 한국어 = ko)를 위한 것이며 실제로 혼합 처리를 위해 최대 97개의 언어를 지원합니다.
         
     | 
| 70 | 
         
            +
            #===========================================================================================================
         
     | 
| 71 | 
         
            +
             
     | 
| 72 | 
         
            +
            # ===========================================================================================================
         
     | 
| 73 | 
         
            +
            # 分词功能:将文章或句子里的例如(中/英/日/韩),按不同语言自动识别并拆分,让它更适合TTS处理。
         
     | 
| 74 | 
         
            +
            # 本代码专为各种 TTS 项目的前端文本多语种混合标注区分,多语言混合训练和推理而编写。
         
     | 
| 75 | 
         
            +
            # ===========================================================================================================
         
     | 
| 76 | 
         
            +
            # (1)自动分词:“韩语中的오빠读什么呢?あなたの体育の先生は誰ですか? 此次发布会带来了四款iPhone 15系列机型”
         
     | 
| 77 | 
         
            +
            # (2)手动分词:“你的名字叫<ja>佐々木?<ja>吗?”
         
     | 
| 78 | 
         
            +
            # 本处理结果主要针对(中文=zh , 日文=ja , 英文=en , 韩语=ko), 实际上可支持多达 97 种不同的语言混合处理。
         
     | 
| 79 | 
         
            +
            # ===========================================================================================================
         
     | 
| 80 | 
         
            +
             
     | 
| 81 | 
         
            +
             
     | 
| 82 | 
         
            +
            # 手动分词标签规范:<语言标签>文本内容</语言标签>
         
     | 
| 83 | 
         
            +
            # 수동 단어 분할 태그 사양: <언어 태그> 텍스트 내용</언어 태그>
         
     | 
| 84 | 
         
            +
            # Manual word segmentation tag specification: <language tags> text content </language tags>
         
     | 
| 85 | 
         
            +
            # 手動分詞タグ仕様:<言語タグ>テキスト内容</言語タグ>
         
     | 
| 86 | 
         
            +
            # ===========================================================================================================
         
     | 
| 87 | 
         
            +
            # For manual word segmentation, tags need to appear in pairs, such as: "<ja>佐々木<ja>" or "<ja>佐々木</ja>"
         
     | 
| 88 | 
         
            +
            # 如需手动分词,标签需要成对出现,例如:“<ja>佐々木<ja>”  或者  “<ja>佐々木</ja>”
         
     | 
| 89 | 
         
            +
            # 错误示范:“你的名字叫<ja>佐々木。” 此句子中出现的单个<ja>标签将被忽略,不会处理。
         
     | 
| 90 | 
         
            +
            # Error demonstration: "Your name is <ja>佐々木。" Single <ja> tags that appear in this sentence will be ignored and will not be processed.
         
     | 
| 91 | 
         
            +
            # ===========================================================================================================
         
     | 
| 92 | 
         
            +
             
     | 
| 93 | 
         
            +
             
     | 
| 94 | 
         
            +
            # ===========================================================================================================
         
     | 
| 95 | 
         
            +
            # 语音合成标记语言 SSML , 这里只支持它的标签(非 XML)Speech Synthesis Markup Language SSML, only its tags are supported here (not XML)
         
     | 
| 96 | 
         
            +
            # 想支持更多的 SSML 标签?欢迎 PR! Want to support more SSML tags? PRs are welcome!
         
     | 
| 97 | 
         
            +
            # 说明:除了中文以外,它也可改造成支持多语种 SSML ,不仅仅是中文。
         
     | 
| 98 | 
         
            +
            # Note: In addition to Chinese, it can also be modified to support multi-language SSML, not just Chinese.
         
     | 
| 99 | 
         
            +
            # ===========================================================================================================
         
     | 
| 100 | 
         
            +
            # 中文实现:Chinese implementation:
         
     | 
| 101 | 
         
            +
            # 【SSML】<number>=中文大写数字读法(单字)
         
     | 
| 102 | 
         
            +
            # 【SSML】<telephone>=数字转成中文电话号码大写汉字(单字)
         
     | 
| 103 | 
         
            +
            # 【SSML】<currency>=按金额发音。
         
     | 
| 104 | 
         
            +
            # 【SSML】<date>=按日期发音。支持 2024年08月24, 2024/8/24, 2024-08, 08-24, 24 等输入。
         
     | 
| 105 | 
         
            +
            # ===========================================================================================================
         
     | 
| 106 | 
         
            +
            class LangSSML:
         
     | 
| 107 | 
         
            +
                
         
     | 
| 108 | 
         
            +
                # 纯数字
         
     | 
| 109 | 
         
            +
                _zh_numerals_number = {
         
     | 
| 110 | 
         
            +
                        '0': '零',
         
     | 
| 111 | 
         
            +
                        '1': '一',
         
     | 
| 112 | 
         
            +
                        '2': '二',
         
     | 
| 113 | 
         
            +
                        '3': '三',
         
     | 
| 114 | 
         
            +
                        '4': '四',
         
     | 
| 115 | 
         
            +
                        '5': '五',
         
     | 
| 116 | 
         
            +
                        '6': '六',
         
     | 
| 117 | 
         
            +
                        '7': '七',
         
     | 
| 118 | 
         
            +
                        '8': '八',
         
     | 
| 119 | 
         
            +
                        '9': '九'
         
     | 
| 120 | 
         
            +
                    }
         
     | 
| 121 | 
         
            +
                
         
     | 
| 122 | 
         
            +
                
         
     | 
| 123 | 
         
            +
                # 将2024/8/24, 2024-08, 08-24, 24 标准化“年月日”
         
     | 
| 124 | 
         
            +
                # Standardize 2024/8/24, 2024-08, 08-24, 24 to "year-month-day"
         
     | 
| 125 | 
         
            +
                def _format_chinese_data(date_str:str):
         
     | 
| 126 | 
         
            +
                    # 处理日期格式
         
     | 
| 127 | 
         
            +
                    input_date = date_str
         
     | 
| 128 | 
         
            +
                    if date_str is None or date_str.strip() == "":return ""
         
     | 
| 129 | 
         
            +
                    date_str = re.sub(r"[\/\._|年|月]","-",date_str)
         
     | 
| 130 | 
         
            +
                    date_str = re.sub(r"日",r"",date_str)
         
     | 
| 131 | 
         
            +
                    date_arrs = date_str.split(' ')
         
     | 
| 132 | 
         
            +
                    if len(date_arrs) == 1 and ":" in date_arrs[0]:
         
     | 
| 133 | 
         
            +
                        time_str = date_arrs[0]
         
     | 
| 134 | 
         
            +
                        date_arrs = []
         
     | 
| 135 | 
         
            +
                    else:
         
     | 
| 136 | 
         
            +
                        time_str = date_arrs[1] if len(date_arrs) >=2 else ""
         
     | 
| 137 | 
         
            +
                    def nonZero(num,cn,func=None):
         
     | 
| 138 | 
         
            +
                        if func is not None:num=func(num)
         
     | 
| 139 | 
         
            +
                        return f"{num}{cn}" if num is not None and num != "" and num != "0" else ""
         
     | 
| 140 | 
         
            +
                    f_number = LangSSML.to_chinese_number
         
     | 
| 141 | 
         
            +
                    f_currency = LangSSML.to_chinese_currency
         
     | 
| 142 | 
         
            +
                    # year, month, day
         
     | 
| 143 | 
         
            +
                    year_month_day = ""
         
     | 
| 144 | 
         
            +
                    if len(date_arrs) > 0:
         
     | 
| 145 | 
         
            +
                        year, month, day = "","",""
         
     | 
| 146 | 
         
            +
                        parts = date_arrs[0].split('-')
         
     | 
| 147 | 
         
            +
                        if len(parts) == 3:  # 格式为 YYYY-MM-DD
         
     | 
| 148 | 
         
            +
                            year, month, day = parts
         
     | 
| 149 | 
         
            +
                        elif len(parts) == 2:  # 格式为 MM-DD 或 YYYY-MM
         
     | 
| 150 | 
         
            +
                            if len(parts[0]) == 4:  # 年-月
         
     | 
| 151 | 
         
            +
                                year, month = parts
         
     | 
| 152 | 
         
            +
                            else:month, day = parts # 月-日
         
     | 
| 153 | 
         
            +
                        elif len(parts[0]) > 0:  # 仅有月-日或年
         
     | 
| 154 | 
         
            +
                            if len(parts[0]) == 4:
         
     | 
| 155 | 
         
            +
                                year = parts[0]
         
     | 
| 156 | 
         
            +
                            else:day = parts[0]
         
     | 
| 157 | 
         
            +
                        year,month,day = nonZero(year,"年",f_number),nonZero(month,"月",f_currency),nonZero(day,"日",f_currency)
         
     | 
| 158 | 
         
            +
                        year_month_day = re.sub(r"([年|月|日])+",r"\1",f"{year}{month}{day}")
         
     | 
| 159 | 
         
            +
                    # hours, minutes, seconds
         
     | 
| 160 | 
         
            +
                    time_str = re.sub(r"[\/\.\-:_]",":",time_str)
         
     | 
| 161 | 
         
            +
                    time_arrs = time_str.split(":")
         
     | 
| 162 | 
         
            +
                    hours, minutes, seconds = "","",""
         
     | 
| 163 | 
         
            +
                    if len(time_arrs) == 3: # H/M/S
         
     | 
| 164 | 
         
            +
                        hours, minutes, seconds = time_arrs
         
     | 
| 165 | 
         
            +
                    elif len(time_arrs) == 2:# H/M
         
     | 
| 166 | 
         
            +
                        hours, minutes = time_arrs
         
     | 
| 167 | 
         
            +
                    elif len(time_arrs[0]) > 0:hours = f'{time_arrs[0]}点'  # H
         
     | 
| 168 | 
         
            +
                    if len(time_arrs) > 1:
         
     | 
| 169 | 
         
            +
                        hours, minutes, seconds = nonZero(hours,"点",f_currency),nonZero(minutes,"分",f_currency),nonZero(seconds,"秒",f_currency)
         
     | 
| 170 | 
         
            +
                    hours_minutes_seconds = re.sub(r"([点|分|秒])+",r"\1",f"{hours}{minutes}{seconds}")
         
     | 
| 171 | 
         
            +
                    output_date = f"{year_month_day}{hours_minutes_seconds}"
         
     | 
| 172 | 
         
            +
                    return output_date
         
     | 
| 173 | 
         
            +
                
         
     | 
| 174 | 
         
            +
                # 【SSML】number=中文大写数字读法(单字)
         
     | 
| 175 | 
         
            +
                # Chinese Numbers(single word)
         
     | 
| 176 | 
         
            +
                def to_chinese_number(num:str):
         
     | 
| 177 | 
         
            +
                    pattern = r'(\d+)'
         
     | 
| 178 | 
         
            +
                    zh_numerals = LangSSML._zh_numerals_number
         
     | 
| 179 | 
         
            +
                    arrs = re.split(pattern, num)
         
     | 
| 180 | 
         
            +
                    output = ""
         
     | 
| 181 | 
         
            +
                    for item in arrs:
         
     | 
| 182 | 
         
            +
                        if re.match(pattern,item):
         
     | 
| 183 | 
         
            +
                            output += ''.join(zh_numerals[digit] if digit in zh_numerals else "" for digit in str(item))
         
     | 
| 184 | 
         
            +
                        else:output += item
         
     | 
| 185 | 
         
            +
                    output = output.replace(".","点")
         
     | 
| 186 | 
         
            +
                    return output
         
     | 
| 187 | 
         
            +
                
         
     | 
| 188 | 
         
            +
                # 【SSML】telephone=数字转成中文电话号码大写汉字(单字)
         
     | 
| 189 | 
         
            +
                # Convert numbers to Chinese phone numbers in uppercase Chinese characters(single word)
         
     | 
| 190 | 
         
            +
                def to_chinese_telephone(num:str):
         
     | 
| 191 | 
         
            +
                    output = LangSSML.to_chinese_number(num.replace("+86","")) # zh +86
         
     | 
| 192 | 
         
            +
                    output = output.replace("一","幺")
         
     | 
| 193 | 
         
            +
                    return output
         
     | 
| 194 | 
         
            +
                
         
     | 
| 195 | 
         
            +
                # 【SSML】currency=按金额发音。
         
     | 
| 196 | 
         
            +
                # Digital processing from GPT_SoVITS num.py (thanks)
         
     | 
| 197 | 
         
            +
                def to_chinese_currency(num:str):
         
     | 
| 198 | 
         
            +
                    pattern = r'(\d+)'
         
     | 
| 199 | 
         
            +
                    arrs = re.split(pattern, num)
         
     | 
| 200 | 
         
            +
                    output = ""
         
     | 
| 201 | 
         
            +
                    for item in arrs:
         
     | 
| 202 | 
         
            +
                        if re.match(pattern,item):
         
     | 
| 203 | 
         
            +
                            output += num2str(item)
         
     | 
| 204 | 
         
            +
                        else:output += item
         
     | 
| 205 | 
         
            +
                    output = output.replace(".","点")
         
     | 
| 206 | 
         
            +
                    return output
         
     | 
| 207 | 
         
            +
                
         
     | 
| 208 | 
         
            +
                # 【SSML】date=按日期发音。支持 2024年08月24, 2024/8/24, 2024-08, 08-24, 24 等输入。
         
     | 
| 209 | 
         
            +
                def to_chinese_date(num:str):
         
     | 
| 210 | 
         
            +
                    chinese_date = LangSSML._format_chinese_data(num)
         
     | 
| 211 | 
         
            +
                    return chinese_date
         
     | 
| 212 | 
         
            +
                
         
     | 
| 213 | 
         
            +
                
         
     | 
| 214 | 
         
            +
                
         
     | 
| 215 | 
         
            +
             
     | 
| 216 | 
         
            +
            class LangSegment():
         
     | 
| 217 | 
         
            +
                
         
     | 
| 218 | 
         
            +
                _text_cache = None
         
     | 
| 219 | 
         
            +
                _text_lasts = None
         
     | 
| 220 | 
         
            +
                _text_langs = None
         
     | 
| 221 | 
         
            +
                _lang_count = None
         
     | 
| 222 | 
         
            +
                _lang_eos =   None
         
     | 
| 223 | 
         
            +
                
         
     | 
| 224 | 
         
            +
                # 可自定义语言匹配标签:カスタマイズ可能な言語対応タグ:사용자 지정 가능한 언어 일치 태그:
         
     | 
| 225 | 
         
            +
                # Customizable language matching tags: These are supported,이 표현들은 모두 지지합니다
         
     | 
| 226 | 
         
            +
                # <zh>你好<zh> , <ja>佐々木</ja> , <en>OK<en> , <ko>오빠</ko> 这些写法均支持
         
     | 
| 227 | 
         
            +
                SYMBOLS_PATTERN = r'(<([a-zA-Z|-]*)>(.*?)<\/*[a-zA-Z|-]*>)'
         
     | 
| 228 | 
         
            +
                
         
     | 
| 229 | 
         
            +
                # 语言过滤组功能, 可以指定保留语言。不在过滤组中的语言将被清除。您可随心搭配TTS语音合成所支持的语言。
         
     | 
| 230 | 
         
            +
                # 언어 필터 그룹 기능을 사용하면 예약된 언어를 지정할 수 있습니다. 필터 그룹에 없는 언어는 지워집니다. TTS 텍스트에서 지원하는 언어를 원하는 대로 일치시킬 수 있습니다.
         
     | 
| 231 | 
         
            +
                # 言語フィルターグループ機能では、予約言語を指定できます。フィルターグループに含まれていない言語はクリアされます。TTS音声合成がサポートする言語を自由に組み合わせることができます。
         
     | 
| 232 | 
         
            +
                # The language filter group function allows you to specify reserved languages. 
         
     | 
| 233 | 
         
            +
                # Languages not in the filter group will be cleared. You can match the languages supported by TTS Text To Speech as you like.
         
     | 
| 234 | 
         
            +
                # 排名越前,优先级越高,The higher the ranking, the higher the priority,ランキングが上位になるほど、優先度が高くなります。
         
     | 
| 235 | 
         
            +
                
         
     | 
| 236 | 
         
            +
                # 系统默认过滤器。System default filter。(ISO 639-1 codes given)
         
     | 
| 237 | 
         
            +
                # ----------------------------------------------------------------------------------------------------------------------------------
         
     | 
| 238 | 
         
            +
                # "zh"中文=Chinese ,"en"英语=English ,"ja"日语=Japanese ,"ko"韩语=Korean ,"fr"法语=French ,"vi"越南语=Vietnamese , "ru"俄语=Russian
         
     | 
| 239 | 
         
            +
                # "th"泰语=Thai
         
     | 
| 240 | 
         
            +
                # ----------------------------------------------------------------------------------------------------------------------------------
         
     | 
| 241 | 
         
            +
                DEFAULT_FILTERS = ["zh", "ja", "ko", "en"]
         
     | 
| 242 | 
         
            +
                
         
     | 
| 243 | 
         
            +
                # 用户可自定义过滤器。User-defined filters
         
     | 
| 244 | 
         
            +
                Langfilters = DEFAULT_FILTERS[:] # 创建副本
         
     | 
| 245 | 
         
            +
                
         
     | 
| 246 | 
         
            +
                # 合并文本
         
     | 
| 247 | 
         
            +
                isLangMerge = True
         
     | 
| 248 | 
         
            +
                
         
     | 
| 249 | 
         
            +
                # 试验性支持:您可自定义添加:"fr"法语 , "vi"越南语。Experimental: You can customize to add: "fr" French, "vi" Vietnamese.
         
     | 
| 250 | 
         
            +
                # 请使用API启用:LangSegment.setfilters(["zh", "en", "ja", "ko", "fr", "vi" , "ru" , "th"]) # 您可自定义添加,如:"fr"法语 , "vi"越南语。
         
     | 
| 251 | 
         
            +
                
         
     | 
| 252 | 
         
            +
                # 预览版功能,自动启用或禁用,无需设置
         
     | 
| 253 | 
         
            +
                # Preview feature, automatically enabled or disabled, no settings required
         
     | 
| 254 | 
         
            +
                EnablePreview = False
         
     | 
| 255 | 
         
            +
                
         
     | 
| 256 | 
         
            +
                # 除此以外,它支持简写过滤器,只需按不同语种任意组合即可。
         
     | 
| 257 | 
         
            +
                # In addition to that, it supports abbreviation filters, allowing for any combination of different languages.
         
     | 
| 258 | 
         
            +
                # 示例:您可以任意指定多种组合,进行过滤
         
     | 
| 259 | 
         
            +
                # Example: You can specify any combination to filter
         
     | 
| 260 | 
         
            +
                
         
     | 
| 261 | 
         
            +
                # 中/日语言优先级阀值(评分范围为 0 ~ 1):评分低于设定阀值 <0.89 时,启用 filters 中的优先级。\n
         
     | 
| 262 | 
         
            +
                # 중/일본어 우선 순위 임계값(점수 범위 0-1): 점수가 설정된 임계값 <0.89보다 낮을 때 필터에서 우선 순위를 활성화합니다.
         
     | 
| 263 | 
         
            +
                # 中国語/日本語の優先度しきい値(スコア範囲0〜1):スコアが設定されたしきい値<0.89未満の場合、フィルターの優先度が有効になります。\n
         
     | 
| 264 | 
         
            +
                # Chinese and Japanese language priority threshold (score range is 0 ~ 1): The default threshold is 0.89.  \n
         
     | 
| 265 | 
         
            +
                # Only the common characters between Chinese and Japanese are processed with confidence and priority. \n
         
     | 
| 266 | 
         
            +
                LangPriorityThreshold = 0.89
         
     | 
| 267 | 
         
            +
                
         
     | 
| 268 | 
         
            +
                # Langfilters = ["zh"]              # 按中文识别
         
     | 
| 269 | 
         
            +
                # Langfilters = ["en"]              # 按英文识别
         
     | 
| 270 | 
         
            +
                # Langfilters = ["ja"]              # 按日文识别
         
     | 
| 271 | 
         
            +
                # Langfilters = ["ko"]              # 按韩文识别
         
     | 
| 272 | 
         
            +
                # Langfilters = ["zh_ja"]           # 中日混合识别
         
     | 
| 273 | 
         
            +
                # Langfilters = ["zh_en"]           # 中英混合识别
         
     | 
| 274 | 
         
            +
                # Langfilters = ["ja_en"]           # 日英混合识别
         
     | 
| 275 | 
         
            +
                # Langfilters = ["zh_ko"]           # 中韩混合识别
         
     | 
| 276 | 
         
            +
                # Langfilters = ["ja_ko"]           # 日韩混合识别
         
     | 
| 277 | 
         
            +
                # Langfilters = ["en_ko"]           # 英韩混合识别
         
     | 
| 278 | 
         
            +
                # Langfilters = ["zh_ja_en"]        # 中日英混合识别
         
     | 
| 279 | 
         
            +
                # Langfilters = ["zh_ja_en_ko"]     # 中日英韩混合识别
         
     | 
| 280 | 
         
            +
                
         
     | 
| 281 | 
         
            +
                # 更多过滤组合,请您随意。。。For more filter combinations, please feel free to......
         
     | 
| 282 | 
         
            +
                # より多くのフィルターの組み合わせ、お気軽に。。。더 많은 필터 조합을 원하시면 자유롭게 해주세요. .....
         
     | 
| 283 | 
         
            +
                
         
     | 
| 284 | 
         
            +
                # 可选保留:支持中文数字拼音格式,更方便前端实现拼音音素修改和推理,默认关闭 False 。
         
     | 
| 285 | 
         
            +
                # 开启后 True ,括号内的数字拼音格式均保留,并识别输出为:"zh"中文。
         
     | 
| 286 | 
         
            +
                keepPinyin = False 
         
     | 
| 287 | 
         
            +
                
         
     | 
| 288 | 
         
            +
                
         
     | 
| 289 | 
         
            +
                # DEFINITION
         
     | 
| 290 | 
         
            +
                PARSE_TAG = re.compile(r'(⑥\$*\d+[\d]{6,}⑥)')
         
     | 
| 291 | 
         
            +
                
         
     | 
| 292 | 
         
            +
                @staticmethod
         
     | 
| 293 | 
         
            +
                def _clears():
         
     | 
| 294 | 
         
            +
                    LangSegment._text_cache = None
         
     | 
| 295 | 
         
            +
                    LangSegment._text_lasts = None
         
     | 
| 296 | 
         
            +
                    LangSegment._text_langs = None
         
     | 
| 297 | 
         
            +
                    LangSegment._text_waits = None
         
     | 
| 298 | 
         
            +
                    LangSegment._lang_count = None
         
     | 
| 299 | 
         
            +
                    LangSegment._lang_eos   = None
         
     | 
| 300 | 
         
            +
                    pass
         
     | 
| 301 | 
         
            +
                
         
     | 
| 302 | 
         
            +
                @staticmethod
         
     | 
| 303 | 
         
            +
                def _is_english_word(word):
         
     | 
| 304 | 
         
            +
                    return bool(re.match(r'^[a-zA-Z]+$', word))
         
     | 
| 305 | 
         
            +
             
     | 
| 306 | 
         
            +
                @staticmethod
         
     | 
| 307 | 
         
            +
                def _is_chinese(word):
         
     | 
| 308 | 
         
            +
                    for char in word:
         
     | 
| 309 | 
         
            +
                        if '\u4e00' <= char <= '\u9fff':
         
     | 
| 310 | 
         
            +
                            return True
         
     | 
| 311 | 
         
            +
                    return False
         
     | 
| 312 | 
         
            +
                
         
     | 
| 313 | 
         
            +
                @staticmethod
         
     | 
| 314 | 
         
            +
                def _is_japanese_kana(word):
         
     | 
| 315 | 
         
            +
                    pattern = re.compile(r'[\u3040-\u309F\u30A0-\u30FF]+')
         
     | 
| 316 | 
         
            +
                    matches = pattern.findall(word)
         
     | 
| 317 | 
         
            +
                    return len(matches) > 0
         
     | 
| 318 | 
         
            +
                
         
     | 
| 319 | 
         
            +
                @staticmethod
         
     | 
| 320 | 
         
            +
                def _insert_english_uppercase(word):
         
     | 
| 321 | 
         
            +
                    modified_text = re.sub(r'(?<!\b)([A-Z])', r' \1', word)
         
     | 
| 322 | 
         
            +
                    modified_text = modified_text.strip('-')
         
     | 
| 323 | 
         
            +
                    return modified_text + " "
         
     | 
| 324 | 
         
            +
                
         
     | 
| 325 | 
         
            +
                @staticmethod
         
     | 
| 326 | 
         
            +
                def _split_camel_case(word):
         
     | 
| 327 | 
         
            +
                    return re.sub(r'(?<!^)(?=[A-Z])', ' ', word)
         
     | 
| 328 | 
         
            +
                
         
     | 
| 329 | 
         
            +
                @staticmethod
         
     | 
| 330 | 
         
            +
                def _statistics(language, text):
         
     | 
| 331 | 
         
            +
                    # Language word statistics:
         
     | 
| 332 | 
         
            +
                    # Chinese characters usually occupy double bytes
         
     | 
| 333 | 
         
            +
                    if LangSegment._lang_count is None or not isinstance(LangSegment._lang_count, defaultdict):
         
     | 
| 334 | 
         
            +
                        LangSegment._lang_count = defaultdict(int)
         
     | 
| 335 | 
         
            +
                    lang_count = LangSegment._lang_count
         
     | 
| 336 | 
         
            +
                    if not "|" in language:
         
     | 
| 337 | 
         
            +
                        lang_count[language] += int(len(text)*2) if language == "zh" else len(text)
         
     | 
| 338 | 
         
            +
                    LangSegment._lang_count = lang_count
         
     | 
| 339 | 
         
            +
                    pass
         
     | 
| 340 | 
         
            +
                
         
     | 
| 341 | 
         
            +
                @staticmethod
         
     | 
| 342 | 
         
            +
                def _clear_text_number(text):
         
     | 
| 343 | 
         
            +
                    if text == "\n":return text,False # Keep Line Breaks
         
     | 
| 344 | 
         
            +
                    clear_text = re.sub(r'([^\w\s]+)','',re.sub(r'\n+','',text)).strip()
         
     | 
| 345 | 
         
            +
                    is_number = len(re.sub(re.compile(r'(\d+)'),'',clear_text)) == 0
         
     | 
| 346 | 
         
            +
                    return clear_text,is_number
         
     | 
| 347 | 
         
            +
                
         
     | 
| 348 | 
         
            +
                @staticmethod
         
     | 
| 349 | 
         
            +
                def _saveData(words,language:str,text:str,score:float,symbol=None):
         
     | 
| 350 | 
         
            +
                    # Pre-detection
         
     | 
| 351 | 
         
            +
                    clear_text , is_number = LangSegment._clear_text_number(text)
         
     | 
| 352 | 
         
            +
                    # Merge the same language and save the results
         
     | 
| 353 | 
         
            +
                    preData = words[-1] if len(words) > 0 else None
         
     | 
| 354 | 
         
            +
                    if symbol is not None:pass
         
     | 
| 355 | 
         
            +
                    elif preData is not None and preData["symbol"] is None:
         
     | 
| 356 | 
         
            +
                        if len(clear_text) == 0:language = preData["lang"]
         
     | 
| 357 | 
         
            +
                        elif is_number == True:language = preData["lang"]
         
     | 
| 358 | 
         
            +
                        _ , pre_is_number = LangSegment._clear_text_number(preData["text"])
         
     | 
| 359 | 
         
            +
                        if (preData["lang"] == language):
         
     | 
| 360 | 
         
            +
                            LangSegment._statistics(preData["lang"],text)
         
     | 
| 361 | 
         
            +
                            text = preData["text"] + text
         
     | 
| 362 | 
         
            +
                            preData["text"] = text
         
     | 
| 363 | 
         
            +
                            return preData
         
     | 
| 364 | 
         
            +
                        elif pre_is_number == True:
         
     | 
| 365 | 
         
            +
                            text = f'{preData["text"]}{text}'
         
     | 
| 366 | 
         
            +
                            words.pop()
         
     | 
| 367 | 
         
            +
                    elif is_number == True: 
         
     | 
| 368 | 
         
            +
                        priority_language = LangSegment._get_filters_string()[:2]
         
     | 
| 369 | 
         
            +
                        if priority_language in "ja-zh-en-ko-fr-vi":language = priority_language
         
     | 
| 370 | 
         
            +
                    data = {"lang":language,"text": text,"score":score,"symbol":symbol}
         
     | 
| 371 | 
         
            +
                    filters = LangSegment.Langfilters
         
     | 
| 372 | 
         
            +
                    if filters is None or len(filters) == 0 or "?" in language or   \
         
     | 
| 373 | 
         
            +
                        language in filters or language in filters[0] or \
         
     | 
| 374 | 
         
            +
                        filters[0] == "*" or filters[0] in "alls-mixs-autos":
         
     | 
| 375 | 
         
            +
                        words.append(data)
         
     | 
| 376 | 
         
            +
                        LangSegment._statistics(data["lang"],data["text"])
         
     | 
| 377 | 
         
            +
                    return data
         
     | 
| 378 | 
         
            +
             
     | 
| 379 | 
         
            +
                @staticmethod
         
     | 
| 380 | 
         
            +
                def _addwords(words,language,text,score,symbol=None):
         
     | 
| 381 | 
         
            +
                    if text == "\n":pass # Keep Line Breaks
         
     | 
| 382 | 
         
            +
                    elif text is None or len(text.strip()) == 0:return True
         
     | 
| 383 | 
         
            +
                    if language is None:language = ""
         
     | 
| 384 | 
         
            +
                    language = language.lower()
         
     | 
| 385 | 
         
            +
                    if language == 'en':text = LangSegment._insert_english_uppercase(text)
         
     | 
| 386 | 
         
            +
                    # text = re.sub(r'[(())]', ',' , text) # Keep it.
         
     | 
| 387 | 
         
            +
                    text_waits = LangSegment._text_waits
         
     | 
| 388 | 
         
            +
                    ispre_waits = len(text_waits)>0
         
     | 
| 389 | 
         
            +
                    preResult = text_waits.pop() if ispre_waits else None
         
     | 
| 390 | 
         
            +
                    if preResult is None:preResult = words[-1] if len(words) > 0 else None
         
     | 
| 391 | 
         
            +
                    if preResult and ("|" in preResult["lang"]):   
         
     | 
| 392 | 
         
            +
                        pre_lang = preResult["lang"]
         
     | 
| 393 | 
         
            +
                        if language in pre_lang:preResult["lang"] = language = language.split("|")[0]
         
     | 
| 394 | 
         
            +
                        else:preResult["lang"]=pre_lang.split("|")[0]
         
     | 
| 395 | 
         
            +
                        if ispre_waits:preResult = LangSegment._saveData(words,preResult["lang"],preResult["text"],preResult["score"],preResult["symbol"])
         
     | 
| 396 | 
         
            +
                    pre_lang = preResult["lang"] if preResult else None
         
     | 
| 397 | 
         
            +
                    if ("|" in language) and (pre_lang and not pre_lang in language and not "…" in language):language = language.split("|")[0]
         
     | 
| 398 | 
         
            +
                    if "|" in language:LangSegment._text_waits.append({"lang":language,"text": text,"score":score,"symbol":symbol})
         
     | 
| 399 | 
         
            +
                    else:LangSegment._saveData(words,language,text,score,symbol)
         
     | 
| 400 | 
         
            +
                    return False
         
     | 
| 401 | 
         
            +
                
         
     | 
| 402 | 
         
            +
                @staticmethod
         
     | 
| 403 | 
         
            +
                def _get_prev_data(words):
         
     | 
| 404 | 
         
            +
                    data = words[-1] if words and len(words) > 0 else None
         
     | 
| 405 | 
         
            +
                    if data:return (data["lang"] , data["text"])
         
     | 
| 406 | 
         
            +
                    return (None,"")
         
     | 
| 407 | 
         
            +
                
         
     | 
| 408 | 
         
            +
                @staticmethod
         
     | 
| 409 | 
         
            +
                def _match_ending(input , index):
         
     | 
| 410 | 
         
            +
                    if input is None or len(input) == 0:return False,None
         
     | 
| 411 | 
         
            +
                    input = re.sub(r'\s+', '', input)
         
     | 
| 412 | 
         
            +
                    if len(input) == 0 or abs(index) > len(input):return False,None
         
     | 
| 413 | 
         
            +
                    ending_pattern = re.compile(r'([「」“”‘’"\'::。.!!?.?])')
         
     | 
| 414 | 
         
            +
                    return ending_pattern.match(input[index]),input[index]
         
     | 
| 415 | 
         
            +
                
         
     | 
| 416 | 
         
            +
                @staticmethod
         
     | 
| 417 | 
         
            +
                def _cleans_text(cleans_text):
         
     | 
| 418 | 
         
            +
                    cleans_text = re.sub(r'(.*?)([^\w]+)', r'\1 ', cleans_text)
         
     | 
| 419 | 
         
            +
                    cleans_text = re.sub(r'(.)\1+', r'\1', cleans_text)
         
     | 
| 420 | 
         
            +
                    return cleans_text.strip()
         
     | 
| 421 | 
         
            +
                
         
     | 
| 422 | 
         
            +
                @staticmethod
         
     | 
| 423 | 
         
            +
                def _mean_processing(text:str):
         
     | 
| 424 | 
         
            +
                    if text is None or (text.strip()) == "":return None , 0.0
         
     | 
| 425 | 
         
            +
                    arrs = LangSegment._split_camel_case(text).split(" ")
         
     | 
| 426 | 
         
            +
                    langs = []
         
     | 
| 427 | 
         
            +
                    for t in arrs:
         
     | 
| 428 | 
         
            +
                        if len(t.strip()) <= 3:continue
         
     | 
| 429 | 
         
            +
                        language, score = langid.classify(t)
         
     | 
| 430 | 
         
            +
                        langs.append({"lang":language})
         
     | 
| 431 | 
         
            +
                    if len(langs) == 0:return None , 0.0
         
     | 
| 432 | 
         
            +
                    return Counter([item['lang'] for item in langs]).most_common(1)[0][0],1.0
         
     | 
| 433 | 
         
            +
                
         
     | 
| 434 | 
         
            +
                @staticmethod
         
     | 
| 435 | 
         
            +
                def _lang_classify(cleans_text):
                    """
                    Classify ``cleans_text`` with ``langid``.

                    Returns:
                        ``(language, score)`` with the score rounded to 3 decimal places.
                    """
                    language, score = langid.classify(cleans_text)
                    # The score may arrive as a NumPy scalar (e.g. np.float32 on Huggingface);
                    # unwrap it to a plain Python number before rounding.
                    is_numpy_scalar = score is not None and isinstance(score, np.generic)
                    if is_numpy_scalar and hasattr(score, "item"):
                        score = score.item()
                    return language, round(score, 3)
         
     | 
| 442 | 
         
            +
                
         
     | 
| 443 | 
         
            +
                @staticmethod
         
     | 
| 444 | 
         
            +
                def _get_filters_string():
                    """
                    Return ``LangSegment.Langfilters`` as one lower-cased, stripped string
                    joined with ``-``; an empty string when no filters are set (``None``).
                    """
                    filters = LangSegment.Langfilters
                    if filters is None:
                        return ""
                    joined = "-".join(filters)
                    return joined.lower().strip()
         
     | 
| 447 | 
         
            +
                
         
     | 
| 448 | 
         
            +
                @staticmethod
         
     | 
| 449 | 
         
            +
                def _parse_language(words , segment):
                    """
                    Split ``segment`` on punctuation, classify each chunk and add it to ``words``.

                    Chunks that are pure punctuation, that are followed by pure punctuation, or
                    whose cleaned text is 2 characters or fewer are glued onto the following
                    chunk so the classifier sees longer input. Short Chinese-looking chunks
                    (cleaned length <= 5) get extra zh/ja disambiguation based on the language
                    filters, kana detection, the previous word's language and the chunk's
                    final character.

                    Args:
                        words: result list; read through ``LangSegment._get_prev_data`` and
                            extended through ``LangSegment._addwords``.
                        segment: text to split and classify.
                    """
                    LANG_JA = "ja"
                    LANG_ZH = "zh"
                    LANG_ZH_JA = f'{LANG_ZH}|{LANG_JA}'
                    LANG_JA_ZH = f'{LANG_JA}|{LANG_ZH}'
                    # Both patterns are loop-invariant, so they are compiled once up front.
                    regex_pattern = re.compile(r'([^\w\s]+)')
                    number_tags = re.compile(r'(⑥\d{6,}⑥)')

                    def _is_punct_only(chunk):
                        # True when nothing is left after dropping newlines and punctuation.
                        return len(regex_pattern.sub('', re.sub(r'\n+', '', chunk)).strip()) == 0

                    lines = regex_pattern.split(segment)
                    last_index = len(lines) - 1
                    LANG_EOS = LangSegment._lang_eos
                    for index, text in enumerate(lines):
                        if len(text) == 0:
                            continue
                        EOS = index >= last_index
                        nextId = index + 1
                        nextText = "" if EOS else lines[nextId]
                        # Merge forward when this chunk or the next one is only punctuation
                        # (an empty next chunk counts as punctuation-only as well).
                        if not EOS and (_is_punct_only(text) or _is_punct_only(nextText)):
                            lines[nextId] = f'{text}{nextText}'
                            continue
                        # Classification ignores number placeholders and digits.
                        cleans_text = number_tags.sub('', text)
                        cleans_text = re.sub(r'\d+', '', cleans_text)
                        cleans_text = LangSegment._cleans_text(cleans_text)
                        # langid is inaccurate on very short input: splice it onto the next chunk.
                        if not EOS and len(cleans_text) <= 2:
                            lines[nextId] = f'{text}{nextText}'
                            continue
                        language, score = LangSegment._lang_classify(cleans_text)
                        prev_language, _ = LangSegment._get_prev_data(words)
                        # Text made only of CJK Unified Ideographs (U+4E00..U+9FFF) is forced to zh.
                        if language != LANG_ZH and all('\u4e00' <= c <= '\u9fff' for c in re.sub(r'\s', '', cleans_text)):
                            language, score = LANG_ZH, 1
                        if len(cleans_text) <= 5 and LangSegment._is_chinese(cleans_text):
                            filters_string = LangSegment._get_filters_string()
                            if score < LangSegment.LangPriorityThreshold and len(filters_string) > 0:
                                # Low confidence: prefer whichever of ja/zh comes first in the filters.
                                index_ja = filters_string.find(LANG_JA)
                                index_zh = filters_string.find(LANG_ZH)
                                if index_ja != -1 and index_ja < index_zh:
                                    language = LANG_JA
                                elif index_zh != -1 and index_zh < index_ja:
                                    language = LANG_ZH
                            if LangSegment._is_japanese_kana(cleans_text):
                                language = LANG_JA
                            elif len(cleans_text) > 2 and score > 0.90:
                                pass  # confident enough, keep the classifier's answer
                            elif EOS and LANG_EOS:
                                if len(cleans_text) <= 1:
                                    language = LANG_ZH
                            else:
                                if language == LANG_ZH or (len(cleans_text) <= 2 and prev_language == LANG_ZH):
                                    LANG_UNKNOWN = LANG_ZH_JA
                                else:
                                    LANG_UNKNOWN = LANG_JA_ZH
                                _, match_char = LangSegment._match_ending(text, -1)
                                if prev_language:
                                    referen = prev_language in LANG_UNKNOWN or LANG_UNKNOWN in prev_language
                                else:
                                    referen = False
                                if match_char in "。.":
                                    # Sentence finished: inherit the previous language when compatible.
                                    if referen and len(words) > 0:
                                        language = prev_language
                                else:
                                    language = f"{LANG_UNKNOWN}|…"
                        # Put the original numbers back in place of their placeholders.
                        text = number_tags.sub(LangSegment._restore_number, text)
                        LangSegment._addwords(words, language, text, score)
         
     | 
| 499 | 
         
            +
                
         
     | 
| 500 | 
         
            +
                # ----------------------------------------------------------
         
     | 
| 501 | 
         
            +
                # 【SSML】中文数字处理:Chinese Number Processing (SSML support)
         
     | 
| 502 | 
         
            +
                # 这里默认都是中文,用于处理 SSML 中文标签。当然可以支持任意语言,例如:
         
     | 
| 503 | 
         
            +
                # The default here is Chinese, which is used to process SSML Chinese tags. Of course, any language can be supported, for example:
         
     | 
| 504 | 
         
            +
                # 中文电话号码:<telephone>1234567</telephone>
         
     | 
| 505 | 
         
            +
                # 中文数字号码:<number>1234567</number>
         
     | 
| 506 | 
         
            +
                @staticmethod
         
     | 
| 507 | 
         
            +
                def _process_symbol_SSML(words,data):
                    """
                    Handle an SSML-style tag whose name is ``match[1]`` and content ``match[2]``.

                    ``telephone``, ``number``, ``currency`` and ``date`` tags are converted to
                    their Chinese reading through the matching ``LangSSML.to_chinese_*``
                    function and labelled ``"zh"``. Any other tag keeps its text and uses the
                    tag name itself as the language. The result is passed to
                    ``LangSegment._addwords`` with score 1.0 and the tag name as last argument.
                    """
                    _tag, match = data
                    ssml_name = match[1]
                    text = match[2]
                    # SSML tag name -> name of the LangSSML converter producing the Chinese reading.
                    converters = {
                        "telephone": "to_chinese_telephone",  # telephone number
                        "number": "to_chinese_number",        # digit-by-number reading
                        "currency": "to_chinese_currency",    # amount of money
                        "date": "to_chinese_date",            # date
                    }
                    language = ssml_name
                    converter_name = converters.get(ssml_name)
                    if converter_name is not None:
                        language = "zh"
                        text = getattr(LangSSML, converter_name)(text)
                    LangSegment._addwords(words, language, text, 1.0, ssml_name)
         
     | 
| 534 | 
         
            +
                    
         
     | 
| 535 | 
         
            +
                # ----------------------------------------------------------
         
     | 
| 536 | 
         
            +
                
         
     | 
| 537 | 
         
            +
                @staticmethod
         
     | 
| 538 | 
         
            +
                def _restore_number(matche):
                    """
                    Regex replacement callback: map a placeholder back to its cached match.

                    Returns the match stored in ``LangSegment._text_cache`` for the matched
                    placeholder, or the placeholder itself when it is not in the cache.
                    """
                    placeholder = matche.group(0)
                    cache = LangSegment._text_cache
                    if placeholder not in cache:
                        return placeholder
                    # Cache entries have the shape (process, (tag, match)).
                    _process, (_tag, original) = cache[placeholder]
                    return original
         
     | 
| 546 | 
         
            +
                
         
     | 
| 547 | 
         
            +
                @staticmethod
         
     | 
| 548 | 
         
            +
                def _pattern_symbols(item , text):
         
     | 
| 549 | 
         
            +
                    if text is None:return text
         
     | 
| 550 | 
         
            +
                    tag , pattern , process = item
         
     | 
| 551 | 
         
            +
                    matches = pattern.findall(text)
         
     | 
| 552 | 
         
            +
                    if len(matches) == 1 and "".join(matches[0]) == text:
         
     | 
| 553 | 
         
            +
                        return text
         
     | 
| 554 | 
         
            +
                    for i , match in enumerate(matches):
         
     | 
| 555 | 
         
            +
                        key = f"⑥{tag}{i:06d}⑥"
         
     | 
| 556 | 
         
            +
                        text = re.sub(pattern , key , text , count=1)
         
     | 
| 557 | 
         
            +
                        LangSegment._text_cache[key] = (process , (tag , match))
         
     | 
| 558 | 
         
            +
                    return text
         
     | 
| 559 | 
         
            +
                
         
     | 
| 560 | 
         
            +
                @staticmethod
         
     | 
| 561 | 
         
            +
                def _process_symbol(words,data):
                    """
                    Handle a language-tagged symbol whose tag is ``match[1]`` and text ``match[2]``.

                    When the tag occurs in the filter string the text is added directly with
                    score 1.0 and a final argument of ``True``; otherwise the data is handed
                    to ``LangSegment._process_symbol_SSML``.
                    """
                    _tag, match = data
                    language = match[1]
                    text = match[2]
                    filters = LangSegment._get_filters_string()
                    # Substring test against the "-"-joined filter string.
                    if language in filters:
                        LangSegment._addwords(words, language, text, 1.0, True)
                    else:
                        LangSegment._process_symbol_SSML(words, data)
         
     | 
| 572 | 
         
            +
                
         
     | 
| 573 | 
         
            +
                @staticmethod
         
     | 
| 574 | 
         
            +
                def _process_english(words,data):
                    """
                    Handle a run of Latin-script text (``match[0]``).

                    With ``LangSegment.EnablePreview`` off, the whole text is added as English
                    with score 1.0. With it on (experimental), the text is split into
                    sentences and each one is classified. Sentences whose language cannot be
                    determined, or that score >= 0.95 for a language outside the filters, are
                    skipped. The rest are added under the detected language when it is in the
                    filters, otherwise under ``"en"``, or under the first filter when that is
                    ``"fr"`` and the score is <= 0.15.
                    """
                    _tag, match = data
                    text = match[0]
                    filters = LangSegment._get_filters_string()
                    priority_language = filters[:2]
                    # The flag is compared to True by equality, not by mere truthiness.
                    preview_enabled = LangSegment.EnablePreview == True
                    if not preview_enabled:
                        # Default is English.
                        LangSegment._addwords(words, "en", text, 1.0)
                        return
                    # Experimental: other language support.
                    sentence_pattern = re.compile(r'(.*?[。.??!!]+[\n]{,1})')
                    for sentence in sentence_pattern.split(text):
                        if len(sentence.strip()) == 0:
                            continue
                        cleans_text = LangSegment._cleans_text(sentence)
                        language, score = LangSegment._lang_classify(cleans_text)
                        if language not in filters:
                            # Fall back to a per-word majority vote.
                            language, score = LangSegment._mean_processing(cleans_text)
                        if language is None or score <= 0.0:
                            continue
                        if language not in filters:
                            if score >= 0.95:
                                # High score, but not in the filter: excluded.
                                continue
                            if score <= 0.15 and filters[:2] == "fr":
                                language = priority_language
                            else:
                                language = "en"
                        LangSegment._addwords(words, language, sentence, score)
         
     | 
| 602 | 
         
            +
                
         
     | 
| 603 | 
         
            +
                @staticmethod
         
     | 
| 604 | 
         
            +
                def _process_Russian(words,data):
                    """Add the matched text (``match[0]``) to ``words`` as Russian (``"ru"``) with score 1.0."""
                    _tag, match = data
                    LangSegment._addwords(words, "ru", match[0], 1.0)
         
     | 
| 611 | 
         
            +
                
         
     | 
| 612 | 
         
            +
                @staticmethod
         
     | 
| 613 | 
         
            +
                def _process_Thai(words,data):
                    """Add the matched text (``match[0]``) to ``words`` as Thai (``"th"``) with score 1.0."""
                    _tag, match = data
                    LangSegment._addwords(words, "th", match[0], 1.0)
         
     | 
| 620 | 
         
            +
                
         
     | 
| 621 | 
         
            +
                @staticmethod
         
     | 
| 622 | 
         
            +
                def _process_korean(words,data):
                    """Add the matched text (``match[0]``) to ``words`` as Korean (``"ko"``) with score 1.0."""
                    _tag, match = data
                    LangSegment._addwords(words, "ko", match[0], 1.0)
         
     | 
| 629 | 
         
            +
                
         
     | 
| 630 | 
         
            +
                @staticmethod
         
     | 
| 631 | 
         
            +
                def _process_quotes(words,data):
                    """
                    Handle quoted text; ``match`` holds (opening, content, closing).

                    If the joined text contains placeholder tags it is processed recursively
                    as a non-root tag sequence. Otherwise short content (cleaned length <= 5)
                    goes through ``_parse_language``, and longer content is classified as a
                    whole and added together with its quotes.
                    """
                    _tag, match = data
                    text = "".join(match)
                    if len(LangSegment.PARSE_TAG.findall(text)) > 0:
                        LangSegment._process_tags(words, text, False)
                        return
                    # Only the quoted content (without the quote marks) is classified.
                    cleans_text = LangSegment._cleans_text(match[1])
                    if len(cleans_text) > 5:
                        language, score = LangSegment._lang_classify(cleans_text)
                        LangSegment._addwords(words, language, text, score)
                        return
                    LangSegment._parse_language(words, text)
         
     | 
| 645 | 
         
            +
                
         
     | 
| 646 | 
         
            +
                    
         
     | 
| 647 | 
         
            +
                @staticmethod
         
     | 
| 648 | 
         
            +
                def _process_pinyin(words,data):
                    """Add the matched text to ``words`` as Chinese (``"zh"``) with score 1.0."""
                    _tag, match = data
                    LangSegment._addwords(words, "zh", match, 1.0)
         
     | 
| 655 | 
         
            +
                
         
     | 
| 656 | 
         
            +
                @staticmethod
         
     | 
| 657 | 
         
            +
                def _process_number(words,data): # "$0" process only
                    """
                    Add a bare number to ``words`` with score 0.0.

                    Numbers are universal across languages, so they cannot identify one by
                    themselves. The language of the first entry in ``words`` is reused, or
                    ``"zh"`` when ``words`` is empty. Per the original note this handler is
                    not executed in normal operation and exists for testing only.
                    """
                    _tag, match = data
                    if len(words) > 0:
                        language = words[0]["lang"]
                    else:
                        language = "zh"
                    LangSegment._addwords(words, language, match, 0.0)
         
     | 
| 669 | 
         
            +
                
         
     | 
| 670 | 
         
            +
                @staticmethod
         
     | 
| 671 | 
         
            +
                def _process_tags(words , text , root_tag):
                    """
                    Split ``text`` on placeholder tags and dispatch every piece.

                    Pieces matching ``LangSegment.PARSE_TAG`` are looked up in
                    ``LangSegment._text_cache`` and passed to their cached handler (if any);
                    all other pieces go through ``_parse_language``. For the root call,
                    ``LangSegment._lang_eos`` is updated on each piece to tell whether it is
                    the last one.

                    Returns:
                        The same ``words`` list that was passed in.
                    """
                    cache = LangSegment._text_cache
                    segments = re.split(LangSegment.PARSE_TAG, text)
                    last_index = len(segments) - 1
                    for position, segment in enumerate(segments):
                        if root_tag:
                            LangSegment._lang_eos = position >= last_index
                        if not LangSegment.PARSE_TAG.match(segment):
                            LangSegment._parse_language(words, segment)
                            continue
                        handler, payload = cache[segment]
                        if handler:
                            handler(words, payload)
                    return words
         
     | 
| 684 | 
         
            +
                
         
     | 
| 685 | 
         
            +
                @staticmethod
         
     | 
| 686 | 
         
            +
                def _merge_results(words):
         
     | 
| 687 | 
         
            +
                    new_word = []
         
     | 
| 688 | 
         
            +
                    for index , cur_data in enumerate(words):
         
     | 
| 689 | 
         
            +
                        if "symbol" in cur_data:del cur_data["symbol"]
         
     | 
| 690 | 
         
            +
                        if index == 0:new_word.append(cur_data)
         
     | 
| 691 | 
         
            +
                        else:
         
     | 
| 692 | 
         
            +
                            pre_data = new_word[-1]
         
     | 
| 693 | 
         
            +
                            if cur_data["lang"] == pre_data["lang"]:
         
     | 
| 694 | 
         
            +
                                pre_data["text"] = f'{pre_data["text"]}{cur_data["text"]}'
         
     | 
| 695 | 
         
            +
                            else:new_word.append(cur_data)
         
     | 
| 696 | 
         
            +
                    return new_word
         
     | 
| 697 | 
         
            +
                
         
     | 
| 698 | 
         
            +
                @staticmethod
         
     | 
| 699 | 
         
            +
                def _parse_symbols(text):
         
     | 
| 700 | 
         
            +
                    TAG_NUM = "00" # "00" => default channels , "$0" => testing channel
         
     | 
| 701 | 
         
            +
                    TAG_S1,TAG_S2,TAG_P1,TAG_P2,TAG_EN,TAG_KO,TAG_RU,TAG_TH = "$1" ,"$2" ,"$3" ,"$4" ,"$5" ,"$6" ,"$7","$8"
         
     | 
| 702 | 
         
            +
                    TAG_BASE = re.compile(fr'(([【《((“‘"\']*[LANGUAGE]+[\W\s]*)+)')
         
     | 
| 703 | 
         
            +
                    # Get custom language filter
         
     | 
| 704 | 
         
            +
                    filters = LangSegment.Langfilters
         
     | 
| 705 | 
         
            +
                    filters = filters if filters is not None else ""
         
     | 
| 706 | 
         
            +
                    # =======================================================================================================
         
     | 
| 707 | 
         
            +
                    # Experimental: Other language support.Thử nghiệm: Hỗ trợ ngôn ngữ khác.Expérimental : prise en charge d’autres langues.
         
     | 
| 708 | 
         
            +
                    # 相关语言字符如有缺失,熟悉相关语言的朋友,可以提交把缺失的发音符号补全。
         
     | 
| 709 | 
         
            +
                    # If relevant language characters are missing, friends who are familiar with the relevant languages can submit a submission to complete the missing pronunciation symbols.
         
     | 
| 710 | 
         
            +
                    # S'il manque des caractères linguistiques pertinents, les amis qui connaissent les langues concernées peuvent soumettre une soumission pour compléter les symboles de prononciation manquants.
         
     | 
| 711 | 
         
            +
                    # Nếu thiếu ký tự ngôn ngữ liên quan, những người bạn quen thuộc với ngôn ngữ liên quan có thể gửi bài để hoàn thành các ký hiệu phát âm còn thiếu.
         
     | 
| 712 | 
         
            +
                    # -------------------------------------------------------------------------------------------------------
         
     | 
| 713 | 
         
            +
                    # Preview feature, other language support
         
     | 
| 714 | 
         
            +
                    enablePreview = LangSegment.EnablePreview
         
     | 
| 715 | 
         
            +
                    if "fr" in filters or \
         
     | 
| 716 | 
         
            +
                       "vi" in filters:enablePreview = True
         
     | 
| 717 | 
         
            +
                    LangSegment.EnablePreview = enablePreview
         
     | 
| 718 | 
         
            +
                    # 实验性:法语字符支持。Prise en charge des caractères français
         
     | 
| 719 | 
         
            +
                    RE_FR = "" if not enablePreview else "àáâãäåæçèéêëìíîïðñòóôõöùúûüýþÿ"
         
     | 
| 720 | 
         
            +
                    # 实验性:越南语字符支持。Hỗ trợ ký tự tiếng Việt
         
     | 
| 721 | 
         
            +
                    RE_VI = "" if not enablePreview else "đơưăáàảãạắằẳẵặấầẩẫậéèẻẽẹếềểễệíìỉĩịóòỏõọốồổỗộớờởỡợúùủũụứừửữựôâêơưỷỹ"
         
     | 
| 722 | 
         
            +
                    # -------------------------------------------------------------------------------------------------------
         
     | 
| 723 | 
         
            +
                    # Basic options:
         
     | 
| 724 | 
         
            +
                    process_list = [
         
     | 
| 725 | 
         
            +
                        (  TAG_S1  , re.compile(LangSegment.SYMBOLS_PATTERN) , LangSegment._process_symbol  ),               # Symbol Tag
         
     | 
| 726 | 
         
            +
                        (  TAG_KO  , re.compile(re.sub(r'LANGUAGE',f'\uac00-\ud7a3',TAG_BASE.pattern))    , LangSegment._process_korean  ),              # Korean words
         
     | 
| 727 | 
         
            +
                        (  TAG_TH  , re.compile(re.sub(r'LANGUAGE',f'\u0E00-\u0E7F',TAG_BASE.pattern))    , LangSegment._process_Thai ),                 # Thai words support.
         
     | 
| 728 | 
         
            +
                        (  TAG_RU  , re.compile(re.sub(r'LANGUAGE',f'А-Яа-яЁё',TAG_BASE.pattern))         , LangSegment._process_Russian ),              # Russian words support.
         
     | 
| 729 | 
         
            +
                        (  TAG_NUM , re.compile(r'(\W*\d+\W+\d*\W*\d*)')        , LangSegment._process_number  ),  # Number words, Universal in all languages, Ignore it.
         
     | 
| 730 | 
         
            +
                        (  TAG_EN  , re.compile(re.sub(r'LANGUAGE',f'a-zA-Z{RE_FR}{RE_VI}',TAG_BASE.pattern))    , LangSegment._process_english ),       # English words + Other language support.
         
     | 
| 731 | 
         
            +
                        (  TAG_P1  , re.compile(r'(["\'])(.*?)(\1)')         , LangSegment._process_quotes  ),     # Regular quotes
         
     | 
| 732 | 
         
            +
                        (  TAG_P2  , re.compile(r'([\n]*[【《((“‘])([^【《((“‘’”))》】]{3,})([’”))》】][\W\s]*[\n]{,1})')   , LangSegment._process_quotes  ),  # Special quotes, There are left and right.
         
     | 
| 733 | 
         
            +
                    ]
         
     | 
| 734 | 
         
            +
                    # Extended options: Default False
         
     | 
| 735 | 
         
            +
                    if LangSegment.keepPinyin == True:process_list.insert(1 , 
         
     | 
| 736 | 
         
            +
                        (  TAG_S2  , re.compile(r'([\(({](?:\s*\w*\d\w*\s*)+[})\)])') , LangSegment._process_pinyin  ),     # Chinese Pinyin Tag. 
         
     | 
| 737 | 
         
            +
                    ) 
         
     | 
| 738 | 
         
            +
                    # -------------------------------------------------------------------------------------------------------
         
     | 
| 739 | 
         
            +
                    words = []
         
     | 
| 740 | 
         
            +
                    lines = re.findall(r'.*\n*', re.sub(LangSegment.PARSE_TAG, '' ,text))
         
     | 
| 741 | 
         
            +
                    for index , text in enumerate(lines):
         
     | 
| 742 | 
         
            +
                        if len(text.strip()) == 0:continue
         
     | 
| 743 | 
         
            +
                        LangSegment._lang_eos = False
         
     | 
| 744 | 
         
            +
                        LangSegment._text_cache = {}
         
     | 
| 745 | 
         
            +
                        for item in process_list:
         
     | 
| 746 | 
         
            +
                            text = LangSegment._pattern_symbols(item , text)
         
     | 
| 747 | 
         
            +
                        cur_word = LangSegment._process_tags([] , text , True)
         
     | 
| 748 | 
         
            +
                        if len(cur_word) == 0:continue
         
     | 
| 749 | 
         
            +
                        cur_data = cur_word[0] if len(cur_word) > 0 else None
         
     | 
| 750 | 
         
            +
                        pre_data = words[-1] if len(words) > 0 else None
         
     | 
| 751 | 
         
            +
                        if cur_data and pre_data and cur_data["lang"] == pre_data["lang"] \
         
     | 
| 752 | 
         
            +
                            and cur_data["symbol"] == False and pre_data["symbol"] :
         
     | 
| 753 | 
         
            +
                            cur_data["text"] = f'{pre_data["text"]}{cur_data["text"]}'
         
     | 
| 754 | 
         
            +
                            words.pop()
         
     | 
| 755 | 
         
            +
                        words += cur_word
         
     | 
| 756 | 
         
            +
                    if LangSegment.isLangMerge == True:words = LangSegment._merge_results(words)
         
     | 
| 757 | 
         
            +
                    lang_count = LangSegment._lang_count
         
     | 
| 758 | 
         
            +
                    if lang_count and len(lang_count) > 0:
         
     | 
| 759 | 
         
            +
                        lang_count = dict(sorted(lang_count.items(), key=lambda x: x[1], reverse=True))
         
     | 
| 760 | 
         
            +
                        lang_count = list(lang_count.items())
         
     | 
| 761 | 
         
            +
                        LangSegment._lang_count = lang_count
         
     | 
| 762 | 
         
            +
                    return words
         
     | 
| 763 | 
         
            +
                
         
     | 
| 764 | 
         
            +
                @staticmethod
                def setfilters(filters):
                    """Replace the active language filter list.

                    When ``filters`` differs from the stored ``LangSegment.Langfilters``
                    the cached state is cleared through ``LangSegment._clears()`` and the
                    new list is stored. An equal list leaves everything untouched.

                    Args:
                        filters: Language codes to keep, e.g. ["zh", "en", "ja", "ko"].
                    """
                    filters_changed = LangSegment.Langfilters != filters
                    if filters_changed:
                        # A different filter set invalidates whatever was cached.
                        LangSegment._clears()
                        LangSegment.Langfilters = filters
         
     | 
| 774 | 
         
            +
                   
         
     | 
| 775 | 
         
            +
                @staticmethod
                def getfilters():
                    """Return the language filter list currently stored on the class."""
                    active_filters = LangSegment.Langfilters
                    return active_filters
         
     | 
| 778 | 
         
            +
                
         
     | 
| 779 | 
         
            +
                @staticmethod
                def setPriorityThreshold(threshold:float):
                    """Store ``threshold`` in ``LangSegment.LangPriorityThreshold``.

                    Args:
                        threshold (float): New priority threshold value.
                    """
                    LangSegment.LangPriorityThreshold = threshold
         
     | 
| 783 | 
         
            +
                
         
     | 
| 784 | 
         
            +
                @staticmethod
                def getPriorityThreshold():
                    """Return the value held in ``LangSegment.LangPriorityThreshold``."""
                    current_threshold = LangSegment.LangPriorityThreshold
                    return current_threshold
         
     | 
| 787 | 
         
            +
                
         
     | 
| 788 | 
         
            +
                @staticmethod
                def getCounts():
                    """Return per-language character totals, largest first.

                    If ``LangSegment._lang_count`` is already populated it is returned
                    unchanged. Otherwise the totals are computed from the stored
                    segmentation result ``LangSegment._text_langs``, cached in
                    ``LangSegment._lang_count`` and returned.

                    Returns:
                        list: ``(lang, count)`` tuples sorted by count in descending
                        order, or ``[("zh", 0)]`` when no segmentation result is stored.
                    """
                    cached = LangSegment._lang_count
                    if cached is not None:
                        return cached
                    segments = LangSegment._text_langs
                    if segments is None or len(segments) == 0:
                        return [("zh",0)]
                    totals = {}
                    for segment in segments:
                        lang = segment['lang']
                        size = len(segment['text'])
                        if lang == "zh":
                            # Chinese text is weighted double in the statistics.
                            size = int(size * 2)
                        totals[lang] = totals.get(lang, 0) + size
                    # sorted() is stable, so ties keep their first-seen order.
                    ranked = sorted(totals.items(), key=lambda pair: pair[1], reverse=True)
                    LangSegment._lang_count = ranked
                    return ranked
         
     | 
| 800 | 
         
            +
                
         
     | 
| 801 | 
         
            +
                @staticmethod
                def getTexts(text:str):
                    """Segment ``text`` by language, reusing the previous result when possible.

                    ``None`` or whitespace-only input clears the cached state via
                    ``LangSegment._clears()`` and yields an empty list. If ``text`` equals
                    the last parsed input and a result is stored, that result is returned
                    as-is. Otherwise the text is parsed with
                    ``LangSegment._parse_symbols`` and the outcome is remembered.

                    Args:
                        text (str): Text to segment.

                    Returns:
                        list: The segmentation result, or ``[]`` for empty input.
                    """
                    if text is None or len(text.strip()) == 0:
                        LangSegment._clears()
                        return []
                    # Same input as last time: hand back the stored result.
                    previous = LangSegment._text_langs
                    if LangSegment._text_lasts == text and previous is not None:
                        return previous
                    # Fresh input: reset per-parse state before parsing.
                    LangSegment._text_waits = []
                    LangSegment._lang_count = None
                    LangSegment._text_lasts = text
                    segments = LangSegment._parse_symbols(text)
                    LangSegment._text_langs = segments
                    return segments
         
     | 
| 816 | 
         
            +
                
         
     | 
| 817 | 
         
            +
                @staticmethod
                def classify(text:str):
                    """Alias of ``LangSegment.getTexts``; returns its result for ``text``."""
                    segments = LangSegment.getTexts(text)
                    return segments
         
     | 
| 820 | 
         
            +
             
     | 
| 821 | 
         
            +
             
     | 
| 822 | 
         
            +
            def setLangMerge(value:bool):
                """Set whether segmentation results are optimised by the merge step.

                The flag is stored in ``LangSegment.isLangMerge``. ``getTexts`` reuses
                its last result whenever the same text is passed again, so a stale
                result could be returned after the flag changes. To prevent that, the
                cached state is cleared through ``LangSegment._clears()`` when the
                value actually changes, the same way ``setfilters`` does.

                Args:
                    value (bool): True to merge results, False to leave them unmerged.
                """
                if LangSegment.isLangMerge != value:
                    # The cached result was produced under the old setting.
                    LangSegment._clears()
                LangSegment.isLangMerge = value
         
     | 
| 827 | 
         
            +
             
     | 
| 828 | 
         
            +
            def getLangMerge():
                """Return the merge-optimisation flag stored in ``LangSegment.isLangMerge``."""
                merge_enabled = LangSegment.isLangMerge
                return merge_enabled
         
     | 
| 832 | 
         
            +
             
     | 
| 833 | 
         
            +
             
     | 
| 834 | 
         
            +
            def setfilters(filters):
                """
                Language filter group: choose which languages are kept.

                Languages that are not in the filter group are cleared from the
                output, so the list can be matched to the languages the target TTS
                (text-to-speech) engine supports.

                Args:
                    filters (list): e.g. ["zh", "en", "ja", "ko"]. The earlier a
                        language appears in the list, the higher its priority.
                """
                # Module-level convenience wrapper around the class method.
                LangSegment.setfilters(filters)
         
     | 
| 846 | 
         
            +
             
     | 
| 847 | 
         
            +
            def getfilters():
                """
                Return the current language filter group.

                Languages that are not in the filter group are cleared from the
                output; earlier entries have higher priority.

                Returns:
                    The filter list as returned by ``LangSegment.getfilters()``,
                    e.g. ["zh", "en", "ja", "ko"].
                """
                active_filters = LangSegment.getfilters()
                return active_filters
         
     | 
| 858 | 
         
            +
             
     | 
| 859 | 
         
            +
            # # @Deprecated:Use shorter setfilters
         
     | 
| 860 | 
         
            +
            # def setLangfilters(filters):
         
     | 
| 861 | 
         
            +
            #     """
         
     | 
| 862 | 
         
            +
            #     >0.1.9废除:使用更简短的setfilters
         
     | 
| 863 | 
         
            +
            #     """
         
     | 
| 864 | 
         
            +
            #     setfilters(filters)
         
     | 
| 865 | 
         
            +
            # # @Deprecated:Use shorter getfilters
         
     | 
| 866 | 
         
            +
            # def getLangfilters():
         
     | 
| 867 | 
         
            +
            #     """
         
     | 
| 868 | 
         
            +
            #     >0.1.9废除:使用更简短的getfilters
         
     | 
| 869 | 
         
            +
            #     """
         
     | 
| 870 | 
         
            +
            #     return getfilters()
         
     | 
| 871 | 
         
            +
             
     | 
| 872 | 
         
            +
             
     | 
| 873 | 
         
            +
            def setKeepPinyin(value:bool):
                """
                Optionally keep Chinese numeric-pinyin notation (off by default).

                Keeping the notation makes it easier for a front end to modify pinyin
                phonemes and run inference. When enabled (True), numeric pinyin inside
                brackets is preserved and reported as "zh" Chinese.

                The flag is stored in ``LangSegment.keepPinyin``. ``getTexts`` reuses
                its last result whenever the same text is passed again, so a stale
                result could be returned after the flag changes. To prevent that, the
                cached state is cleared through ``LangSegment._clears()`` when the
                value actually changes, the same way ``setfilters`` does.

                Args:
                    value (bool): True to keep bracketed numeric pinyin, False to disable.
                """
                if LangSegment.keepPinyin != value:
                    # The cached result was produced under the old setting.
                    LangSegment._clears()
                LangSegment.keepPinyin = value
         
     | 
| 880 | 
         
            +
             
     | 
| 881 | 
         
            +
            def getKeepPinyin():
                """
                Return whether Chinese numeric-pinyin notation is kept (off by default).

                When enabled, numeric pinyin inside brackets is preserved and reported
                as "zh" Chinese.

                Returns:
                    The value stored in ``LangSegment.keepPinyin``.
                """
                keep_pinyin = LangSegment.keepPinyin
                return keep_pinyin
         
     | 
| 887 | 
         
            +
             
     | 
| 888 | 
         
            +
            def setEnablePreview(value:bool):
                """
                Enable or disable preview functionality (off by default).

                The flag is normalised to a bool and stored in
                ``LangSegment.EnablePreview``. ``getTexts`` reuses its last result
                whenever the same text is passed again, so a stale result could be
                returned after the flag changes. To prevent that, the cached state is
                cleared through ``LangSegment._clears()`` when the value actually
                changes, the same way ``setfilters`` does.

                Args:
                    value (bool): True = on, False = off.
                """
                enabled = (value == True)
                if LangSegment.EnablePreview != enabled:
                    # The cached result was produced under the old setting.
                    LangSegment._clears()
                LangSegment.EnablePreview = enabled
         
     | 
| 897 | 
         
            +
             
     | 
| 898 | 
         
            +
            def getEnablePreview():
                """
                Report whether preview functionality is enabled (off by default).

                Returns:
                    The result of comparing ``LangSegment.EnablePreview`` with True.
                """
                preview_on = (LangSegment.EnablePreview == True)
                return preview_on
         
     | 
| 906 | 
         
            +
             
     | 
| 907 | 
         
            +
            def setPriorityThreshold(threshold:float):
                """
                Set the Chinese/Japanese language priority threshold.

                Scores range from 0 to 1 and the default threshold is 0.89. When a
                score falls below the configured threshold, the priority order given
                in ``filters`` is used. Only the characters common to Chinese and
                Japanese are processed with confidence and priority.

                Args:
                    threshold (float): Threshold in the range 0 ~ 1.
                """
                # Module-level convenience wrapper around the class method.
                LangSegment.setPriorityThreshold(threshold)
         
     | 
| 919 | 
         
            +
             
     | 
| 920 | 
         
            +
            def getPriorityThreshold():
                """
                Return the Chinese/Japanese language priority threshold.

                Scores range from 0 to 1 and the default threshold is 0.89. When a
                score falls below the configured threshold, the priority order given
                in ``filters`` is used. Only the characters common to Chinese and
                Japanese are processed with confidence and priority.

                Returns:
                    The threshold as returned by ``LangSegment.getPriorityThreshold()``.
                """
                current_threshold = LangSegment.getPriorityThreshold()
                return current_threshold
         
     | 
| 931 | 
         
            +
                
         
     | 
| 932 | 
         
            +
            def getTexts(text:str):
                """
                Tokenize multilingual text input into per-language segments.

                Args:
                    text (str): Text content.

                Returns:
                    list: Example result: [{'lang':'zh','text':'?'},...]
                    where lang = language code and text = segment content.
                """
                segments = LangSegment.getTexts(text)
                return segments
         
     | 
| 945 | 
         
            +
             
     | 
| 946 | 
         
            +
            def getCounts():
                """
                Statistics of the segmentation result, used to find the main language.

                Entries are ordered by per-language character count, descending.

                Returns:
                    list: Example result: [('zh', 5), ('ja', 2), ('en', 1)]
                    = [(language, character count including punctuation)].
                """
                counts = LangSegment.getCounts()
                return counts
         
     | 
| 956 | 
         
            +
                
         
     | 
| 957 | 
         
            +
            def classify(text:str):
                """
                Compatibility interface; delegates to ``LangSegment.classify``.

                Args:
                    text (str): Text content.
                """
                segments = LangSegment.classify(text)
                return segments
         
     | 
| 963 | 
         
            +
              
         
     | 
| 964 | 
         
            +
            def printList(langlist):
                """
                Print a result list to standard output.

                A banner line is always printed first. When ``langlist`` is None or
                empty a "no content" notice follows; otherwise every entry is printed
                on its own line.

                Args:
                    langlist: Sized collection of result entries, or None.
                """
                print("\n===================【打印结果】===================")
                has_entries = langlist is not None and len(langlist) != 0
                if has_entries:
                    for entry in langlist:
                        print(entry)
                else:
                    print("无内容结果,No content result")
         
     | 
| 978 | 
         
            +
                
         
     | 
| 979 | 
         
            +
             
     | 
| 980 | 
         
            +
             
     | 
| 981 | 
         
            +
            def main():
         
     | 
| 982 | 
         
            +
                
         
     | 
| 983 | 
         
            +
                # -----------------------------------
         
     | 
| 984 | 
         
            +
                # 更新日志:新版本分词更加精准。
         
     | 
| 985 | 
         
            +
                # Changelog: The new version of the word segmentation is more accurate.
         
     | 
| 986 | 
         
            +
                # チェンジログ:新しいバージョンの単語セグメンテーションはより正確です。
         
     | 
| 987 | 
         
            +
                # Changelog: 분할이라는 단어의 새로운 버전이 더 정확합니다.
         
     | 
| 988 | 
         
            +
                # -----------------------------------
         
     | 
| 989 | 
         
            +
                
         
     | 
| 990 | 
         
            +
                # 输入示例1:(包含日文,中文)Input Example 1: (including Japanese, Chinese)
         
     | 
| 991 | 
         
            +
                # text = "“昨日は雨が降った,音楽、映画。。。”你今天学习日语了吗?春は桜の季節です。语种分词是语音合成必不可少的环节。言語分詞は音声合成に欠かせない環節である!"
         
     | 
| 992 | 
         
            +
                
         
     | 
| 993 | 
         
            +
                # 输入示例2:(包含日文,中文)Input Example 2: (including Japanese, Chinese)
         
     | 
| 994 | 
         
            +
                # text = "欢迎来玩。東京,は日本の首都です。欢迎来玩.  太好了!"
         
     | 
| 995 | 
         
            +
                
         
     | 
| 996 | 
         
            +
                # 输入示例3:(包含日文,中文)Input Example 3: (including Japanese, Chinese)
         
     | 
| 997 | 
         
            +
                # text = "明日、私たちは海辺にバカンスに行きます。你会说日语吗:“中国語、話せますか” 你的日语真好啊!"
         
     | 
| 998 | 
         
            +
                
         
     | 
| 999 | 
         
            +
                
         
     | 
| 1000 | 
         
            +
                # 输入示例4:(包含日文,中文,韩语,英文)Input Example 4: (including Japanese, Chinese, Korean, English)
         
     | 
| 1001 | 
         
            +
                # text = "你的名字叫<ja>佐々木?<ja>吗?韩语中的안녕 오빠读什么呢?あなたの体育の先生は誰ですか? 此次发布会带来了四款iPhone 15系列机型和三款Apple Watch等一系列新品,这次的iPad Air采用了LCD屏幕" 
         
     | 
| 1002 | 
         
            +
                
         
     | 
| 1003 | 
         
            +
                
         
     | 
| 1004 | 
         
            +
                # 试验性支持:"fr"法语 , "vi"越南语 , "ru"俄语 , "th"泰语。Experimental: Other language support.
         
     | 
| 1005 | 
         
            +
                LangSegment.setfilters(["fr", "vi" , "ja", "zh", "ko", "en" , "ru" , "th"])
         
     | 
| 1006 | 
         
            +
                text = """
         
     | 
| 1007 | 
         
            +
            我喜欢在雨天里听音乐。
         
     | 
| 1008 | 
         
            +
            I enjoy listening to music on rainy days.
         
     | 
| 1009 | 
         
            +
            雨の日に音楽を聴くのが好きです。
         
     | 
| 1010 | 
         
            +
            비 오는 날에 음악을 듣는 것을 즐깁니다。
         
     | 
| 1011 | 
         
            +
            J'aime écouter de la musique les jours de pluie.
         
     | 
| 1012 | 
         
            +
            Tôi thích nghe nhạc vào những ngày mưa.
         
     | 
| 1013 | 
         
            +
            Мне нравится слушать музыку в дождливую погоду.
         
     | 
| 1014 | 
         
            +
            ฉันชอบฟังเพลงในวันที่ฝนตก
         
     | 
| 1015 | 
         
            +
            """
         
     | 
| 1016 | 
         
            +
             
     | 
| 1017 | 
         
            +
             
     | 
| 1018 | 
         
            +
             
     | 
| 1019 | 
         
            +
                # 进行分词:(接入TTS项目仅需一行代码调用)Segmentation: (Only one line of code is required to access the TTS project)
         
     | 
| 1020 | 
         
            +
                langlist = LangSegment.getTexts(text)
         
     | 
| 1021 | 
         
            +
                printList(langlist)
         
     | 
| 1022 | 
         
            +
                
         
     | 
| 1023 | 
         
            +
                
         
     | 
| 1024 | 
         
            +
                # 语种统计:Language statistics:
         
     | 
| 1025 | 
         
            +
                print("\n===================【语种统计】===================")
         
     | 
| 1026 | 
         
            +
                # 获取所有语种数组结果,根据内容字数降序排列
         
     | 
| 1027 | 
         
            +
                # Get the array results in all languages, sorted in descending order according to the number of content words
         
     | 
| 1028 | 
         
            +
                langCounts = LangSegment.getCounts()
         
     | 
| 1029 | 
         
            +
                print(langCounts , "\n")
         
     | 
| 1030 | 
         
            +
                
         
     | 
| 1031 | 
         
            +
                # 根据结果获取内容的主要语种 (语言,字数含标点)
         
     | 
| 1032 | 
         
            +
                # Get the main language of content based on the results (language, word count including punctuation)
         
     | 
| 1033 | 
         
            +
                lang , count = langCounts[0] 
         
     | 
| 1034 | 
         
            +
                print(f"输入内容的主要语言为 = {lang} ,字数 = {count}")
         
     | 
| 1035 | 
         
            +
                print("==================================================\n")
         
     | 
| 1036 | 
         
            +
                
         
     | 
| 1037 | 
         
            +
                
         
     | 
| 1038 | 
         
            +
                # 分词输出:lang=语言,text=内容。Word output: lang = language, text = content
         
     | 
| 1039 | 
         
            +
                # ===================【打印结果】===================
         
     | 
| 1040 | 
         
            +
                # {'lang': 'zh', 'text': '你的名字叫'}
         
     | 
| 1041 | 
         
            +
                # {'lang': 'ja', 'text': '佐々木?'}
         
     | 
| 1042 | 
         
            +
                # {'lang': 'zh', 'text': '吗?韩语中的'}
         
     | 
| 1043 | 
         
            +
                # {'lang': 'ko', 'text': '안녕 오빠'}
         
     | 
| 1044 | 
         
            +
                # {'lang': 'zh', 'text': '读什么呢?'}
         
     | 
| 1045 | 
         
            +
                # {'lang': 'ja', 'text': 'あなたの体育の先生は誰ですか?'}
         
     | 
| 1046 | 
         
            +
                # {'lang': 'zh', 'text': ' 此次发布会带来了四款'}
         
     | 
| 1047 | 
         
            +
                # {'lang': 'en', 'text': 'i Phone  '}
         
     | 
| 1048 | 
         
            +
                # {'lang': 'zh', 'text': '15系列机型和三款'}
         
     | 
| 1049 | 
         
            +
                # {'lang': 'en', 'text': 'Apple Watch '}
         
     | 
| 1050 | 
         
            +
                # {'lang': 'zh', 'text': '等一系列新品,这次的'}
         
     | 
| 1051 | 
         
            +
                # {'lang': 'en', 'text': 'i Pad Air '}
         
     | 
| 1052 | 
         
            +
                # {'lang': 'zh', 'text': '采用了'}
         
     | 
| 1053 | 
         
            +
                # {'lang': 'en', 'text': 'L C D '}
         
     | 
| 1054 | 
         
            +
                # {'lang': 'zh', 'text': '屏幕'}
         
     | 
| 1055 | 
         
            +
                # ===================【语种统计】===================
         
     | 
| 1056 | 
         
            +
                
         
     | 
| 1057 | 
         
            +
                # ===================【语种统计】===================
         
     | 
| 1058 | 
         
            +
                # [('zh', 51), ('ja', 19), ('en', 18), ('ko', 5)]
         
     | 
| 1059 | 
         
            +
             
     | 
| 1060 | 
         
            +
                # 输入内容的主要语言为 = zh ,字数 = 51
         
     | 
| 1061 | 
         
            +
                # ==================================================
         
     | 
| 1062 | 
         
            +
                # The main language of the input content is = zh, word count = 51
         
     | 
| 1063 | 
         
            +
                
         
     | 
| 1064 | 
         
            +
                
         
     | 
| 1065 | 
         
            +
            # Script entry point: call main() only when this file is executed directly, not on import.
            if __name__ == "__main__":
         
     | 
| 1066 | 
         
            +
                main()
         
     | 
| 1067 | 
         
            +
             
     | 
| 1068 | 
         
            +
                
         
     | 
    	
        LangSegment/__init__.py
    ADDED
    
    | 
         @@ -0,0 +1,9 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            from .LangSegment import LangSegment,getTexts,classify,getCounts,printList,setfilters,getfilters,setPriorityThreshold,getPriorityThreshold,setEnablePreview,getEnablePreview,setKeepPinyin,getKeepPinyin,setLangMerge,getLangMerge
         
     | 
| 2 | 
         
            +
             
     | 
| 3 | 
         
            +
             
     | 
| 4 | 
         
            +
            # Release version of the package.
         
     | 
| 5 | 
         
            +
            __version__ = '0.3.5'
         
     | 
| 6 | 
         
            +
             
     | 
| 7 | 
         
            +
             
     | 
| 8 | 
         
            +
            # Development version tag.
         
     | 
| 9 | 
         
            +
            __develop__ = 'dev-0.0.1'
         
     | 
    	
        LangSegment/utils/__init__.py
    ADDED
    
    | 
         
            File without changes
         
     | 
    	
        LangSegment/utils/num.py
    ADDED
    
    | 
         @@ -0,0 +1,327 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
         
     | 
| 2 | 
         
            +
            #
         
     | 
| 3 | 
         
            +
            # Licensed under the Apache License, Version 2.0 (the "License");
         
     | 
| 4 | 
         
            +
            # you may not use this file except in compliance with the License.
         
     | 
| 5 | 
         
            +
            # You may obtain a copy of the License at
         
     | 
| 6 | 
         
            +
            #
         
     | 
| 7 | 
         
            +
            #     http://www.apache.org/licenses/LICENSE-2.0
         
     | 
| 8 | 
         
            +
            #
         
     | 
| 9 | 
         
            +
            # Unless required by applicable law or agreed to in writing, software
         
     | 
| 10 | 
         
            +
            # distributed under the License is distributed on an "AS IS" BASIS,
         
     | 
| 11 | 
         
            +
            # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
         
     | 
| 12 | 
         
            +
            # See the License for the specific language governing permissions and
         
     | 
| 13 | 
         
            +
            # limitations under the License.
         
     | 
| 14 | 
         
            +
            # Digital processing from GPT_SoVITS num.py (thanks)
         
     | 
| 15 | 
         
            +
            """
         
     | 
| 16 | 
         
            +
            Rules to verbalize numbers into Chinese characters.
         
     | 
| 17 | 
         
            +
            https://zh.wikipedia.org/wiki/中文数字#現代中文
         
     | 
| 18 | 
         
            +
            """
         
     | 
| 19 | 
         
            +
             
     | 
| 20 | 
         
            +
            import re
         
     | 
| 21 | 
         
            +
            from collections import OrderedDict
         
     | 
| 22 | 
         
            +
            from typing import List
         
     | 
| 23 | 
         
            +
             
     | 
| 24 | 
         
            +
            # Arabic digit character -> Chinese numeral character ('0' -> '零' ... '9' -> '九').
            DIGITS = dict(zip('0123456789', '零一二三四五六七八九'))

            # Power of ten -> Chinese unit character, stored in ascending order so that
            # callers can walk it in reverse to find the largest unit that fits.
            UNITS = OrderedDict([
                (1, '十'),
                (2, '百'),
                (3, '千'),
                (4, '万'),
                (8, '亿'),
            ])

            # Regex fragment: a single outer capturing group (with nested groups for
            # prefixed units) listing the measure words / units that may follow a number.
            COM_QUANTIFIERS = '(处|台|架|枚|趟|幅|平|方|堵|间|床|株|批|项|例|列|篇|栋|注|亩|封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分)'
         
     | 
| 34 | 
         
            +
             
     | 
| 35 | 
         
            +
            # Fraction expression, e.g. "-3/4": optional minus sign, numerator, denominator.
            RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)')


            def replace_frac(match) -> str:
                """Verbalize a fraction matched by ``RE_FRAC``.

                The denominator is read first, followed by "分之" and the numerator;
                a captured minus sign is rendered as "负".

                Args:
                    match (re.Match): groups 1-3 hold the sign, numerator and denominator.
                Returns:
                    str
                """
                minus, upper, lower = match.group(1, 2, 3)
                prefix = "负" if minus else ""
                spoken_upper = num2str(upper)
                spoken_lower = num2str(lower)
                return f"{prefix}{spoken_lower}分之{spoken_upper}"
         
     | 
| 54 | 
         
            +
             
     | 
| 55 | 
         
            +
             
     | 
| 56 | 
         
            +
            # Percentage expression, e.g. "-12.5%": optional minus sign, number, percent sign.
            RE_PERCENTAGE = re.compile(r'(-?)(\d+(\.\d+)?)%')


            def replace_percentage(match) -> str:
                """Verbalize a percentage matched by ``RE_PERCENTAGE``.

                The result is "百分之" followed by the verbalized number, prefixed with
                "负" when a minus sign was captured.

                Args:
                    match (re.Match): group 1 is the sign, group 2 the numeric text.
                Returns:
                    str
                """
                minus, figure = match.group(1, 2)
                prefix = "负" if minus else ""
                spoken = num2str(figure)
                return f"{prefix}百分之{spoken}"
         
     | 
| 73 | 
         
            +
             
     | 
| 74 | 
         
            +
             
     | 
| 75 | 
         
            +
            # Integer expression: an integer carrying an explicit minus sign, e.g. "-10".
            RE_INTEGER = re.compile(r'(-)' r'(\d+)')


            def replace_negative_num(match) -> str:
                """Verbalize a signed integer matched by ``RE_INTEGER``.

                Args:
                    match (re.Match): group 1 is the minus sign, group 2 the digits.
                Returns:
                    str
                """
                minus, digits = match.group(1, 2)
                spoken = num2str(digits)
                if minus:
                    return f"负{spoken}"
                return f"{spoken}"
         
     | 
| 93 | 
         
            +
             
     | 
| 94 | 
         
            +
             
     | 
| 95 | 
         
            +
            # Serial-style number: an unsigned run of three or more digits, e.g. "00078".
            RE_DEFAULT_NUM = re.compile(r'\d{3}\d*')


            def replace_default_num(match) -> str:
                """Verbalize a digit run matched by ``RE_DEFAULT_NUM``.

                The whole matched text is passed to ``verbalize_digit`` with
                ``alt_one=True``.
                NOTE(review): ``alt_one`` presumably selects an alternate reading of the
                digit 1 -- confirm against ``verbalize_digit``.

                Args:
                    match (re.Match)
                Returns:
                    str
                """
                return verbalize_digit(match.group(0), alt_one=True)
         
     | 
| 109 | 
         
            +
             
     | 
| 110 | 
         
            +
             
     | 
| 111 | 
         
            +
            # Arithmetic expression: <operand> <operator> <operand>. An operand is an
            # optionally negative integer or decimal, a bare decimal such as ".5", or a
            # single Latin letter, each optionally followed by superscript characters.
            # Group 1 is the left operand, group 8 the operator, group 9 the right operand.
            RE_ASMD = re.compile(
                r'((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))([\+\-\×÷=])((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))')

            # Operator symbol -> spoken Chinese word.
            asmd_map = {
                '+': '加',
                '-': '减',
                '×': '乘',
                '÷': '除',
                '=': '等于',
            }


            def replace_asmd(match) -> str:
                """Replace the operator of an ``RE_ASMD`` match with its spoken word.

                Both operands are returned exactly as matched; only the operator symbol
                is swapped for its ``asmd_map`` entry.

                Args:
                    match (re.Match)
                Returns:
                    str
                """
                left, operator, right = match.group(1, 8, 9)
                return left + asmd_map[operator] + right
         
     | 
| 134 | 
         
            +
             
     | 
| 135 | 
         
            +
             
     | 
| 136 | 
         
            +
            # Exponent expression: a run of superscript digits / letters, e.g. "²" or "ⁿ".
            RE_POWER = re.compile(r'[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]+')

            # Superscript character -> plain character.
            power_map = {
                '⁰': '0',
                '¹': '1',
                '²': '2',
                '³': '3',
                '⁴': '4',
                '⁵': '5',
                '⁶': '6',
                '⁷': '7',
                '⁸': '8',
                '⁹': '9',
                'ˣ': 'x',
                'ʸ': 'y',
                'ⁿ': 'n',
            }


            def replace_power(match) -> str:
                """Turn a superscript run matched by ``RE_POWER`` into "的<exponent>次方".

                Each superscript character is mapped to its plain form through
                ``power_map``; the mapped characters are joined in order.

                Args:
                    match (re.Match)
                Returns:
                    str
                """
                # Join once instead of growing the string with += inside a loop.
                exponent = "".join(power_map[ch] for ch in match.group(0))
                return "的" + exponent + "次方"
         
     | 
| 167 | 
         
            +
             
     | 
| 168 | 
         
            +
             
     | 
| 169 | 
         
            +
            # Number expressions.
            # Decimal with a mandatory fractional part (optionally signed), or a bare
            # ".digits" form.
            RE_DECIMAL_NUM = re.compile(r'(-?)((\d+)(\.\d+))' r'|(\.(\d+))')
            # Positive integer, an optional 多/余/几/+ marker, then a quantifier.
            RE_POSITIVE_QUANTIFIERS = re.compile(r"(\d+)([多余几\+])?" + COM_QUANTIFIERS)
            # General number: optionally signed integer or decimal, or a bare ".digits" form.
            RE_NUMBER = re.compile(r'(-?)((\d+)(\.\d+)?)' r'|(\.(\d+))')


            def replace_positive_quantifier(match) -> str:
                """Verbalize a "number + quantifier" match from ``RE_POSITIVE_QUANTIFIERS``.

                The digits are verbalized with ``num2str``; a "+" marker is read as "多",
                any other captured marker is kept as is, and the quantifier is appended
                unchanged.

                Args:
                    match (re.Match): group 1 digits, group 2 optional marker,
                        group 3 quantifier.
                Returns:
                    str
                """
                digits, marker, unit = match.group(1, 2, 3)
                if marker == "+":
                    marker = "多"
                elif not marker:
                    marker = ""
                spoken = num2str(digits)
                return f"{spoken}{marker}{unit}"
         
     | 
| 193 | 
         
            +
             
     | 
| 194 | 
         
            +
             
     | 
| 195 | 
         
            +
            def replace_number(match) -> str:
                """Verbalize a number matched by ``RE_NUMBER``.

                A bare ".digits" match (group 5) is handed to ``num2str`` as is.
                Otherwise the numeric text in group 2 is verbalized and prefixed with
                "负" when group 1 captured a minus sign.

                Args:
                    match (re.Match)
                Returns:
                    str
                """
                bare_decimal = match.group(5)
                if bare_decimal:
                    return num2str(bare_decimal)

                prefix = "负" if match.group(1) else ""
                spoken = num2str(match.group(2))
                return f"{prefix}{spoken}"
         
     | 
| 212 | 
         
            +
             
     | 
| 213 | 
         
            +
             
     | 
| 214 | 
         
            +
            # Range expression, e.g. "1-10" or "-5~10".
            # The two bounds, match.group(1) and match.group(6), use the same
            # signed-number pattern as the first alternative of RE_NUMBER. The
            # lookbehind / lookahead reject ranges glued to another digit or to an
            # arithmetic operator.

            RE_RANGE = re.compile(
                r"""
                (?<![\d\+\-\×÷=])      # 使用反向前瞻以确保数字范围之前没有其他数字和操作符
                ((-?)((\d+)(\.\d+)?))  # 匹配范围起始的负数或正数(整数或小数)
                [-~]                   # 匹配范围分隔符
                ((-?)((\d+)(\.\d+)?))  # 匹配范围结束的负数或正数(整数或小数)
                (?![\d\+\-\×÷=])       # 使用正向前瞻以确保数字范围之后没有其他数字和操作符
                """, re.VERBOSE)


            def replace_range(match) -> str:
                """Verbalize a numeric range matched by ``RE_RANGE``.

                Each bound is verbalized by running ``RE_NUMBER`` / ``replace_number``
                over it, and the two results are joined with "到".

                Args:
                    match (re.Match): group 1 is the lower bound, group 6 the upper bound.
                Returns:
                    str
                """
                bounds = match.group(1, 6)
                spoken = [RE_NUMBER.sub(replace_number, bound) for bound in bounds]
                return "到".join(spoken)
         
     | 
| 239 | 
         
            +
             
     | 
| 240 | 
         
            +
             
     | 
| 241 | 
         
            +
            # "~" range between two quantities that each carry a unit, e.g. "5cm~10cm".
            RE_TO_RANGE = re.compile(
                r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)[~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)')


            def replace_to_range(match) -> str:
                """Replace the "~" separator of an ``RE_TO_RANGE`` match with "至".

                Numbers and units are returned exactly as matched.

                Args:
                    match (re.Match)
                Returns:
                    str
                """
                pieces = match.group(0).split('~')
                return '至'.join(pieces)
         
     | 
| 254 | 
         
            +
             
     | 
| 255 | 
         
            +
             
     | 
| 256 | 
         
            +
            def _get_value(value_string: str, use_zero: bool=True) -> List[str]:
                """Break a digit string into a list of Chinese numeral and unit characters.

                Leading zeros are ignored when deciding which unit applies. The string is
                split at the largest unit from ``UNITS`` that is smaller than the number
                of significant digits, and both halves are processed recursively.

                Args:
                    value_string: digit string, possibly with leading zeros.
                    use_zero: when true, a single significant digit that had leading
                        zeros is returned with a preceding "零".
                Returns:
                    List of characters taken from ``DIGITS`` and ``UNITS``; empty when the
                    string holds no non-zero digit.
                """
                significant = value_string.lstrip('0')
                width = len(significant)

                # Nothing but zeros (or an empty string): contributes no symbols.
                if width == 0:
                    return []

                # A lone digit, optionally announced by "零" when zeros were stripped.
                if width == 1:
                    had_leading_zeros = width < len(value_string)
                    if use_zero and had_leading_zeros:
                        return [DIGITS['0'], DIGITS[significant]]
                    return [DIGITS[significant]]

                # Largest unit whose power is below the significant digit count.
                unit_power = next(
                    candidate for candidate in reversed(UNITS.keys()) if candidate < width)
                # Split the original string (zeros included) so that the lower part
                # keeps the zeros needed for a "零" reading.
                head = value_string[:-unit_power]
                tail = value_string[-unit_power:]

                symbols = _get_value(head)
                symbols = symbols + [UNITS[unit_power]]
                return symbols + _get_value(tail)
         
     | 
| 272 | 
         
            +
             
     | 
| 273 | 
         
            +
             
     | 
| 274 | 
         
            +
            def verbalize_cardinal(value_string: str) -> str:
                """Verbalize a digit string as a cardinal number.

                Args:
                    value_string: string of decimal digits.
                Returns:
                    str: '' for empty input, DIGITS['0'] when the input consists only
                    of zeros, otherwise the joined symbols from `_get_value`.
                """
                if not value_string:
                    return ''

                # 000 -> '零' , 0 -> '零'
                trimmed = value_string.lstrip('0')
                if not trimmed:
                    return DIGITS['0']

                symbols = _get_value(trimmed)
                if len(symbols) < 2:
                    return ''.join(symbols)

                # A verbalized number starting with '一十*' is abbreviated as '十*'.
                leading, following = symbols[0], symbols[1]
                if leading == DIGITS['1'] and following == UNITS[1]:
                    symbols = symbols[1:]
                return ''.join(symbols)
         
     | 
| 289 | 
         
            +
             
     | 
| 290 | 
         
            +
             
     | 
| 291 | 
         
            +
            def verbalize_digit(value_string: str, alt_one=False) -> str:
                """Verbalize a digit string one character at a time via DIGITS.

                Args:
                    value_string: string whose characters are all keys of DIGITS.
                    alt_one: when true, every "一" in the output is replaced by "幺".
                Returns:
                    str: the concatenated per-digit symbols.
                """
                spoken = ''.join(DIGITS[char] for char in value_string)
                return spoken.replace("一", "幺") if alt_one else spoken
         
     | 
| 297 | 
         
            +
             
     | 
| 298 | 
         
            +
             
     | 
| 299 | 
         
            +
            def num2str(value_string: str) -> str:
                """Verbalize a decimal number string (integer part plus optional fraction).

                The integer part is verbalized with `verbalize_cardinal`; the fractional
                part, after stripping trailing zeros, is read digit by digit with
                `verbalize_digit` and joined to the integer part with '点'.

                Args:
                    value_string: number text containing at most one '.'.
                Returns:
                    str: the verbalized number.
                Raises:
                    ValueError: if `value_string` contains more than one '.'.
                """
                integer_decimal = value_string.split('.')
                if len(integer_decimal) > 2:
                    raise ValueError(
                        f"The value string: '{value_string}' has more than one point in it."
                    )
                integer = integer_decimal[0]
                decimal = integer_decimal[1] if len(integer_decimal) == 2 else ''

                result = verbalize_cardinal(integer)

                # Trailing zeros of the fraction are not read out.
                decimal = decimal.rstrip('0')
                if decimal:
                    # '.22' is verbalized as '零点二二'
                    # '3.20' is verbalized as '三点二'
                    result = result if result else "零"
                    result += '点' + verbalize_digit(decimal)
                return result
         
     | 
| 320 | 
         
            +
             
     | 
| 321 | 
         
            +
             
     | 
| 322 | 
         
            +
            if __name__ == "__main__":
                # Manual smoke run: verbalize an empty string and print the result.
                text = num2str("")
                print(text)
         
     | 
    	
        requirements.txt
    CHANGED
    
    | 
         @@ -23,7 +23,7 @@ pypinyin==0.53.0 
     | 
|
| 23 | 
         
             
            onnxruntime==1.20.1
         
     | 
| 24 | 
         
             
            Unidecode==1.3.8
         
     | 
| 25 | 
         
             
            phonemizer==3.3.0
         
     | 
| 26 | 
         
            -
            LangSegment==0.3.5
         
     | 
| 27 | 
         
             
            liger_kernel==0.5.4
         
     | 
| 28 | 
         
             
            openai==1.65.2
         
     | 
| 29 | 
         
             
            pydantic==2.10.6
         
     | 
| 
         | 
|
| 23 | 
         
             
            onnxruntime==1.20.1
         
     | 
| 24 | 
         
             
            Unidecode==1.3.8
         
     | 
| 25 | 
         
             
            phonemizer==3.3.0
         
     | 
| 26 | 
         
            +
            # LangSegment==0.3.5
         
     | 
| 27 | 
         
             
            liger_kernel==0.5.4
         
     | 
| 28 | 
         
             
            openai==1.65.2
         
     | 
| 29 | 
         
             
            pydantic==2.10.6
         
     |