| """ |
| ========================================== |
| Code-Switching Router |
| ========================================== |
| """ |
|
|
| from __future__ import annotations |
|
|
| import re |
| from dataclasses import dataclass |
|
|
| import tiktoken |
|
|
@dataclass
class TextSegment:
    """One contiguous run of the input, tagged with a single language label."""

    # The characters of the run, sliced verbatim from the input text.
    text: str
    # A language name from the configured blocks, or "latin" for any
    # characters that fall outside every configured range.
    language: str
    # True when exactly one space directly before this run was removed from
    # the end of the preceding "latin" run (so it is not emitted twice).
    has_leading_space: bool = False


class CodeSwitchSegmenter:
    """Split mixed-script text into runs according to code-point ranges.

    Characters inside a configured range are labelled with that range's
    language; everything else is grouped under the label ``"latin"``.
    """

    # Label used for every run of characters outside the configured ranges.
    _DEFAULT_LANGUAGE = "latin"
    # Internal marker for zero-width joiner / non-joiner characters. It is
    # used only while scanning and is never emitted as a segment language.
    _JOINER = "__joiner__"
    _JOINER_CHARS = ("\u200C", "\u200D")

    def __init__(
        self,
        language_blocks: dict[str, list[tuple[int, int]]] | None = None,
    ):
        """Build the lookup table of code-point ranges.

        Args:
            language_blocks: maps a language name (e.g. ``'sinhala'``) to a
                list of ``(start_cp, end_cp)`` code-point pairs, both ends
                inclusive. ``None`` or an empty dict configures no ranges,
                so all text is reported as ``"latin"``.
        """
        self._ranges: list[tuple[int, int, str]] = []
        if language_blocks:
            for lang, blocks in language_blocks.items():
                for start, end in blocks:
                    self._ranges.append((start, end, lang))

    def _get_char_language(self, ch: str) -> str | None:
        """Return the language label for a single character.

        Returns ``"__joiner__"`` for U+200C / U+200D, the name of the first
        configured range containing the character, or ``None`` when no
        range matches.
        """
        if ch in self._JOINER_CHARS:
            return self._JOINER
        cp = ord(ch)
        for start, end, lang in self._ranges:
            if start <= cp <= end:
                return lang
        return None

    def _scan_plain_run(self, text: str, pos: int) -> int:
        """Return the index of the first configured-script char at/after pos.

        Joiners do not end a plain run; they are absorbed into it.
        """
        n = len(text)
        while pos < n:
            lang = self._get_char_language(text[pos])
            if lang is not None and lang != self._JOINER:
                break
            pos += 1
        return pos

    def _scan_script_run(self, text: str, pos: int, language: str) -> int:
        """Return the index just past the run of ``language`` starting at pos.

        Joiners inside the run are kept; the run ends at the first character
        that belongs to another language or to no configured range.
        """
        n = len(text)
        while pos < n:
            lang = self._get_char_language(text[pos])
            if lang != self._JOINER and lang != language:
                break
            pos += 1
        return pos

    def segment(self, text: str) -> list[TextSegment]:
        """Split ``text`` into consecutive single-language segments.

        A single space that directly precedes a configured-script run is
        dropped from the end of the preceding ``"latin"`` segment and
        recorded on the script segment as ``has_leading_space=True``. If
        that leaves the ``"latin"`` segment empty, it is omitted.

        Returns:
            The segments in input order; an empty list for empty input.
        """
        if not text:
            return []

        segments: list[TextSegment] = []
        n = len(text)
        pos = 0

        while pos < n:
            lang = self._get_char_language(text[pos])
            has_leading_space = False

            # A joiner never opens a script run: it is handled like any other
            # unassigned character so the internal marker cannot leak out as
            # a segment language.
            if lang is None or lang == self._JOINER:
                start = pos
                pos = self._scan_plain_run(text, pos)
                plain = text[start:pos]

                # Only move the space when a script run actually follows.
                if pos < n and plain.endswith(" "):
                    plain = plain[:-1]
                    has_leading_space = True

                if plain:
                    segments.append(
                        TextSegment(text=plain, language=self._DEFAULT_LANGUAGE)
                    )

                if pos >= n:
                    break
                # The plain scan stops only on a configured-script character,
                # so this is always a real language name.
                lang = self._get_char_language(text[pos])

            start = pos
            pos = self._scan_script_run(text, pos, lang)
            segments.append(TextSegment(
                text=text[start:pos],
                language=lang,
                has_leading_space=has_leading_space,
            ))

        return segments
|
|
| |
| |
| |
|
|
if __name__ == "__main__":
    # Demo: segment a handful of sample strings and print the resulting runs.

    # Inclusive code-point ranges handed to the segmenter for this demo.
    script_ranges = {
        "sinhala": [(0x0d80, 0x0dff)],
        "devanagari": [(0x0900, 0x097f)]
    }
    segmenter = CodeSwitchSegmenter(script_ranges)

    samples = (
        "ශ්රී ලංකාව",
        "Hello, world!",
        "The capital is කොළඹ.",
        "ලංකාව is beautiful.",
        "Hello नमस्ते world",
        "2026 AI සහ machine learning",
        "GPT-4 ශ්රී ලංකා",
        "ආචාර්යවරයාගේ වෛද්ය විද්යා පර්යේෂණය සාර්ථකයි.",
        "विद्यालय में पढ़ाई होती है।",
        "AI (Artificial Intelligence) සහ देवनागरी text.",
    )

    for sample in samples:
        # Flatten each segment to a plain tuple so the printed list is compact.
        rows = [
            (part.text, part.language, part.has_leading_space)
            for part in segmenter.segment(sample)
        ]
        print(f"\n Input : {sample!r}")
        print(f" Blocks : {rows}")
|
|