amosnbn committed on
Commit
25480c6
·
1 Parent(s): 548cd9e
Files changed (1) hide show
  1. app.py +13 -13
app.py CHANGED
@@ -92,11 +92,11 @@ def login_required(fn):
92
  PRENORM_LEVEL = os.getenv("PRENORM_LEVEL", "basic").lower()
93
  PRENORM_DEBUG = os.getenv("PRENORM_DEBUG", "0") == "1"
94
 
95
- WS_RE = re.compile(r"\s+")
96
- ELONG_RE = re.compile(r"([bcdfghjklmnpqrstvwxyz])\1{2,}", flags=re.IGNORECASE)
97
- PUNC_RE = re.compile(r"[^\w\s,.;:?!%()\-\—/]|_")
98
- MULTI_PUNC = re.compile(r"([,.;:?!])\1+")
99
- DASH_SPACES= re.compile(r"\s*([-—/])\s*")
100
 
101
  WHITELIST_KEEP_ELONG = {"papua", "koteka", "wamena", "sarmi", "sorong"}
102
 
@@ -149,7 +149,7 @@ def _handle_pu_constructs(text: str) -> str:
149
  rest = m.group(2).strip()
150
  pron_std = PRON_MAP.get(pron, pron)
151
  return f"punya {pron_std} {rest}"
152
- return re.sub(r"\b(sa|saya|ko|kamu|dia|dong|kam|kalian|kitong|kitorang|kita|torang)\s*pu\s+([^.,;:!?]+)",
153
  repl, text, flags=re.IGNORECASE)
154
 
155
  def _handle_mo_bigram(text: str) -> str:
@@ -157,7 +157,7 @@ def _handle_mo_bigram(text: str) -> str:
157
  pron = m.group(1).lower()
158
  pron_std = PRON_MAP.get(pron, pron)
159
  return f"{pron_std} mau"
160
- return re.sub(r"\b(sa|saya|ko|kamu|dia|dong|kam|kalian|kitong|kitorang|kita|torang)\s+mo\b",
161
  repl, text, flags=re.IGNORECASE)
162
 
163
  def _handle_negation_bigrams(text: str) -> str:
@@ -165,13 +165,13 @@ def _handle_negation_bigrams(text: str) -> str:
165
  pron = m.group(1).lower()
166
  pron_std = PRON_MAP.get(pron, pron)
167
  return f"{pron_std} tidak"
168
- text = re.sub(r"\b(sa|saya|ko|kamu|dia|dong|kam|kalian|kitong|kitorang|kita|torang)\s+(tra|ndak|son|tid)\b",
169
  repl_pron, text, flags=re.IGNORECASE)
170
- text = re.sub(r"\btra\s+ada\b", "tidak ada", text, flags=re.IGNORECASE)
171
- text = re.sub(r"\bndak\s+ada\b", "tidak ada", text, flags=re.IGNORECASE)
172
- text = re.sub(r"\btid\s+ada\b", "tidak ada", text, flags=re.IGNORECASE)
173
- text = re.sub(r"\bson\s+ada\b", "tidak ada", text, flags=re.IGNORECASE)
174
- text = re.sub(r"\btidak\s+tau\b", "tidak tahu", text, flags=re.IGNORECASE)
175
  return text
176
 
177
  def _token_level_ops(text: str, aggressive: bool) -> str:
 
92
  PRENORM_LEVEL = os.getenv("PRENORM_LEVEL", "basic").lower()
93
  PRENORM_DEBUG = os.getenv("PRENORM_DEBUG", "0") == "1"
94
 
95
# Pre-compiled regular expressions shared by the functions below.
#
# Every pattern MUST be a raw string. Without the r-prefix Python itself
# consumes the backslashes before `re` sees them: "\1" becomes the control
# character U+0001 (so the back-references in ELONG_RE / MULTI_PUNC never
# match), and "\s", "\w", "\-" are invalid string escapes that newer Python
# versions warn about.
#
# The em dash is written as \u2014 (resolved by `re`) so this source stays
# ASCII-only and cannot be corrupted by an encoding round-trip.

# One or more consecutive whitespace characters.
WS_RE = re.compile(r"\s+")

# A consonant followed by two or more repeats of itself (3+ in a row).
ELONG_RE = re.compile(r"([bcdfghjklmnpqrstvwxyz])\1{2,}", flags=re.IGNORECASE)

# Any character that is not a word character, whitespace, or one of
# , . ; : ? ! % ( ) - em-dash /   -- plus the underscore, which \w would
# otherwise let through.
PUNC_RE = re.compile(r"[^\w\s,.;:?!%()\-\u2014/]|_")

# A run of the same punctuation mark (e.g. "??", "!!!").
MULTI_PUNC = re.compile(r"([,.;:?!])\1+")

# A hyphen, em dash or slash together with any whitespace around it;
# group 1 is the bare separator.
DASH_SPACES = re.compile(r"\s*([-\u2014/])\s*")
100
 
101
  WHITELIST_KEEP_ELONG = {"papua", "koteka", "wamena", "sarmi", "sorong"}
102
 
 
149
  rest = m.group(2).strip()
150
  pron_std = PRON_MAP.get(pron, pron)
151
  return f"punya {pron_std} {rest}"
152
+ return re.sub("\b(sa|saya|ko|kamu|dia|dong|kam|kalian|kitong|kitorang|kita|torang)\s*pu\s+([^.,;:!?]+)",
153
  repl, text, flags=re.IGNORECASE)
154
 
155
  def _handle_mo_bigram(text: str) -> str:
 
157
  pron = m.group(1).lower()
158
  pron_std = PRON_MAP.get(pron, pron)
159
  return f"{pron_std} mau"
160
+ return re.sub("\b(sa|saya|ko|kamu|dia|dong|kam|kalian|kitong|kitorang|kita|torang)\s+mo\b",
161
  repl, text, flags=re.IGNORECASE)
162
 
163
  def _handle_negation_bigrams(text: str) -> str:
 
165
  pron = m.group(1).lower()
166
  pron_std = PRON_MAP.get(pron, pron)
167
  return f"{pron_std} tidak"
168
+ text = re.sub("\b(sa|saya|ko|kamu|dia|dong|kam|kalian|kitong|kitorang|kita|torang)\s+(tra|ndak|son|tid)\b",
169
  repl_pron, text, flags=re.IGNORECASE)
170
+ text = re.sub("\btra\s+ada\b", "tidak ada", text, flags=re.IGNORECASE)
171
+ text = re.sub("\bndak\s+ada\b", "tidak ada", text, flags=re.IGNORECASE)
172
+ text = re.sub("\btid\s+ada\b", "tidak ada", text, flags=re.IGNORECASE)
173
+ text = re.sub("\bson\s+ada\b", "tidak ada", text, flags=re.IGNORECASE)
174
+ text = re.sub("\btidak\s+tau\b", "tidak tahu", text, flags=re.IGNORECASE)
175
  return text
176
 
177
  def _token_level_ops(text: str, aggressive: bool) -> str: