nmstech commited on
Commit
6330193
·
verified ·
1 Parent(s): b9c10fd

Fix İ lowercase bug + apostrophe merge for BPE-split foreign words

Browse files
Files changed (1) hide show
  1. turk_tokenizer/_preprocessor.py +101 -35
turk_tokenizer/_preprocessor.py CHANGED
@@ -6,6 +6,12 @@ import re
6
 
7
  TR_CHARS = set("çğışöüÇĞİŞÖÜ")
8
 
 
 
 
 
 
 
9
  KNOWN_TURKISH_BASES = {
10
  "istanbul", "ankara", "izmir", "türkiye", "anadolu", "boğaziçi",
11
  "cumhuriyet", "atatürk", "karadeniz", "marmara", "ege", "akdeniz",
@@ -21,6 +27,7 @@ KNOWN_FOREIGN_BASES = {
21
  "chatgpt", "openai", "claude", "gemini", "llama", "bert",
22
  "excel", "powerpoint", "outlook", "teams", "slack", "notion",
23
  "spotify", "netflix", "amazon", "alibaba", "huawei", "samsung",
 
24
  }
25
 
26
  TURKISH_SUFFIXES_AFTER_APOSTROPHE = sorted(
@@ -39,11 +46,10 @@ TURKISH_SUFFIXES_AFTER_APOSTROPHE = sorted(
39
  reverse=True,
40
  )
41
 
42
- _APO_SEP = "\ue001"
43
- _APO_RE = re.compile(
44
  r"([A-Za-zÇçĞğİıÖöŞşÜü0-9]{2,})['\u2019]([A-Za-zÇçĞğİıÖöŞşÜü]{1,6})\b"
45
  )
46
- _CAPS_RE = re.compile(r'\b([A-ZÇĞİÖŞÜ]{2,})\b')
47
 
48
 
49
  def _is_turkish_base(word: str) -> bool:
@@ -66,8 +72,8 @@ def _fix_all_caps(text: str) -> tuple[str, set]:
66
 
67
  def _replace(m: re.Match) -> str:
68
  w = m.group(1)
69
- caps.add(w.lower())
70
- return w.lower()
71
 
72
  return _CAPS_RE.sub(_replace, text), caps
73
 
@@ -77,7 +83,7 @@ def _restore_caps_tokens(tokens: list[dict], caps: set) -> list[dict]:
77
  i = 0
78
  while i < len(tokens):
79
  tok = tokens[i]
80
- raw_low = tok["token"].strip().lower()
81
 
82
  if tok["type"] == "ROOT" and raw_low in caps:
83
  result.append({"token": "<uppercase_word>", "type": "ROOT", "_caps": True})
@@ -92,7 +98,7 @@ def _restore_caps_tokens(tokens: list[dict], caps: set) -> list[dict]:
92
  while j < len(tokens):
93
  nt = tokens[j]
94
  if not nt["token"].startswith(" "):
95
- combined += nt["token"].strip().lower()
96
  lookahead.append(nt)
97
  j += 1
98
  if combined in caps:
@@ -115,49 +121,109 @@ def _restore_caps_tokens(tokens: list[dict], caps: set) -> list[dict]:
115
 
116
 
117
  # ── Fix 2: Apostrophe split ───────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
- def _split_apostrophe(text: str) -> str:
120
  def _repl(m: re.Match) -> str:
121
  base, suffix = m.group(1), m.group(2)
122
  if _is_turkish_base(base):
123
- return m.group(0)
124
- if any(suffix.lower() == s for s in TURKISH_SUFFIXES_AFTER_APOSTROPHE):
125
- return f"{base} {_APO_SEP} {suffix}"
 
 
126
  return m.group(0)
127
 
128
- return _APO_RE.sub(_repl, text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
 
131
- def _merge_apostrophe_tokens(tokens: list[dict]) -> list[dict]:
132
- result: list[dict] = []
133
- i = 0
134
- while i < len(tokens):
135
- tok = tokens[i]
136
- if _APO_SEP in tok["token"].strip():
137
- if result:
138
- result[-1]["type"] = "ROOT"
139
- result[-1]["_foreign"] = True
140
- i += 1
141
- if i < len(tokens):
142
- tokens[i]["type"] = "SUFFIX"
143
- tokens[i]["_apo_suffix"] = True
144
- result.append(tokens[i])
145
- i += 1
146
- else:
147
- result.append(tok)
148
- i += 1
149
  return result
150
 
151
 
152
  # ── Combined pre / post ───────────────────────────────────────────────────────
153
 
154
- def preprocess(text: str) -> tuple[str, set]:
 
 
 
 
 
155
  text, caps = _fix_all_caps(text)
156
- text = _split_apostrophe(text)
157
- return text, caps
158
 
159
 
160
- def postprocess(tokens: list[dict], caps: set) -> list[dict]:
 
 
 
161
  tokens = _restore_caps_tokens(tokens, caps)
162
- tokens = _merge_apostrophe_tokens(tokens)
163
  return tokens
 
6
 
7
  TR_CHARS = set("çğışöüÇĞİŞÖÜ")
8
 
9
+
10
+ def _turkish_lower(s: str) -> str:
11
+ """Turkish-aware lowercase: İ→i, I→ı (not i), then standard lower."""
12
+ return s.replace("İ", "i").replace("I", "ı").lower()
13
+
14
+
15
  KNOWN_TURKISH_BASES = {
16
  "istanbul", "ankara", "izmir", "türkiye", "anadolu", "boğaziçi",
17
  "cumhuriyet", "atatürk", "karadeniz", "marmara", "ege", "akdeniz",
 
27
  "chatgpt", "openai", "claude", "gemini", "llama", "bert",
28
  "excel", "powerpoint", "outlook", "teams", "slack", "notion",
29
  "spotify", "netflix", "amazon", "alibaba", "huawei", "samsung",
30
+ "meeting", "tweet", "zoom", "email", "video",
31
  }
32
 
33
  TURKISH_SUFFIXES_AFTER_APOSTROPHE = sorted(
 
46
  reverse=True,
47
  )
48
 
49
+ _APO_RE = re.compile(
 
50
  r"([A-Za-zÇçĞğİıÖöŞşÜü0-9]{2,})['\u2019]([A-Za-zÇçĞğİıÖöŞşÜü]{1,6})\b"
51
  )
52
+ _CAPS_RE = re.compile(r'\b([A-ZÇĞİÖŞÜ]{2,})\b')
53
 
54
 
55
  def _is_turkish_base(word: str) -> bool:
 
72
 
73
  def _replace(m: re.Match) -> str:
74
  w = m.group(1)
75
+ caps.add(_turkish_lower(w))
76
+ return _turkish_lower(w)
77
 
78
  return _CAPS_RE.sub(_replace, text), caps
79
 
 
83
  i = 0
84
  while i < len(tokens):
85
  tok = tokens[i]
86
+ raw_low = _turkish_lower(tok["token"].strip())
87
 
88
  if tok["type"] == "ROOT" and raw_low in caps:
89
  result.append({"token": "<uppercase_word>", "type": "ROOT", "_caps": True})
 
98
  while j < len(tokens):
99
  nt = tokens[j]
100
  if not nt["token"].startswith(" "):
101
+ combined += _turkish_lower(nt["token"].strip())
102
  lookahead.append(nt)
103
  j += 1
104
  if combined in caps:
 
121
 
122
 
123
  # ── Fix 2: Apostrophe split ───────────────────────────────────────────────────
124
+ #
125
+ # Strategy: record (foreign_base, suffix) pairs, replace apostrophe with space.
126
+ # After tokenization, _merge_apostrophe_tokens uses these pairs to find the
127
+ # BPE pieces that form the foreign word and merge them into one FOREIGN ROOT,
128
+ # then marks the following word-initial suffix token as SUFFIX.
129
+ #
130
+ # Old approach used a \ue001 separator — the base tokenizer converts that to
131
+ # '<unknown>' so the separator was never found. Simple-space + pair-list is
132
+ # robust regardless of how the tokenizer handles the input.
133
+
134
+ def _split_apostrophe(text: str) -> tuple[str, list[tuple[str, str]]]:
135
+ """
136
+ Replace FOREIGN'SUFFIX with 'FOREIGN SUFFIX' (apostrophe → space).
137
+ Returns (modified_text, [(foreign_base_lower, suffix_lower), ...]).
138
+ Turkish proper names (İstanbul'da) are left unchanged.
139
+ """
140
+ splits: list[tuple[str, str]] = []
141
 
 
142
  def _repl(m: re.Match) -> str:
143
  base, suffix = m.group(1), m.group(2)
144
  if _is_turkish_base(base):
145
+ return m.group(0) # leave Turkish names alone
146
+ sl = suffix.lower()
147
+ if any(sl == s for s in TURKISH_SUFFIXES_AFTER_APOSTROPHE):
148
+ splits.append((_turkish_lower(base), sl))
149
+ return f"{base} {suffix}" # just drop the apostrophe
150
  return m.group(0)
151
 
152
+ return _APO_RE.sub(_repl, text), splits
153
+
154
+
155
def _merge_apostrophe_tokens(
    tokens: list[dict], apo_splits: list[tuple[str, str]]
) -> list[dict]:
    """
    For each (foreign_base, suffix) pair recorded during _split_apostrophe,
    find the consecutive BPE/ROOT pieces that together spell foreign_base,
    merge them into one FOREIGN ROOT token, and mark the next word-initial
    token whose stripped form == suffix as SUFFIX.
    """
    if not apo_splits:
        return tokens

    result = list(tokens)

    # BUG FIX: the original restarted the scan at index 1 for every pair, so
    # a repeated (base, suffix) pair re-matched the occurrence that was
    # already merged and left later occurrences untouched.  apo_splits is
    # recorded in text order by re.sub, and tokens are in text order, so each
    # pair can only match at or after where the previous pair was handled —
    # resume scanning from there instead.
    search_from = 1

    for foreign_base, suffix in apo_splits:
        for j in range(search_from, len(result)):
            tok_j = result[j]
            # Candidate suffix token: word-initial, stripped form == suffix.
            if not tok_j["token"].startswith(" "):
                continue
            if _turkish_lower(tok_j["token"].strip()) != suffix:
                continue

            # Walk back to find pieces of the word before j (no leading space).
            word_start = j - 1
            while word_start > 0 and not result[word_start]["token"].startswith(" "):
                word_start -= 1

            pieces = result[word_start:j]
            if not pieces:
                continue

            combined = "".join(_turkish_lower(p["token"].strip()) for p in pieces)
            if combined != foreign_base:
                continue

            # Merge pieces into one FOREIGN ROOT (keeps the leading space).
            merged = pieces[0]["token"]
            for p in pieces[1:]:
                merged += p["token"].strip()

            new_root = {"token": merged, "type": "ROOT", "_foreign": True}
            new_suf = {**tok_j, "type": "SUFFIX", "_apo_suffix": True}

            result = (
                result[:word_start]
                + [new_root, new_suf]
                + result[j + 1:]
            )
            # Next pair must match strictly after the tokens just handled.
            search_from = word_start + 2
            break  # this pair is handled

    return result
208
 
209
 
210
  # ── Combined pre / post ───────────────────────────────────────────────────────
211
 
212
def preprocess(text: str) -> tuple[str, set, list]:
    """Prepare text before base tokenization.

    Returns:
        (modified_text, caps_set, apo_splits)
    """
    lowered, caps = _fix_all_caps(text)
    spaced, apo_splits = _split_apostrophe(lowered)
    return spaced, caps, apo_splits
221
 
222
 
223
def postprocess(
    tokens: list[dict], caps: set, apo_splits: list | None = None
) -> list[dict]:
    """Fix tokens after base tokenization."""
    restored = _restore_caps_tokens(tokens, caps)
    pairs = [] if not apo_splits else apo_splits
    return _merge_apostrophe_tokens(restored, pairs)