Spaces:
Running
Running
final1.2
Browse files
app.py
CHANGED
|
@@ -92,11 +92,11 @@ def login_required(fn):
|
|
| 92 |
PRENORM_LEVEL = os.getenv("PRENORM_LEVEL", "basic").lower()
|
| 93 |
PRENORM_DEBUG = os.getenv("PRENORM_DEBUG", "0") == "1"
|
| 94 |
|
| 95 |
-
WS_RE = re.compile(
|
| 96 |
-
ELONG_RE = re.compile(
|
| 97 |
-
PUNC_RE = re.compile(
|
| 98 |
-
MULTI_PUNC = re.compile(
|
| 99 |
-
DASH_SPACES= re.compile(
|
| 100 |
|
| 101 |
WHITELIST_KEEP_ELONG = {"papua", "koteka", "wamena", "sarmi", "sorong"}
|
| 102 |
|
|
@@ -149,7 +149,7 @@ def _handle_pu_constructs(text: str) -> str:
|
|
| 149 |
rest = m.group(2).strip()
|
| 150 |
pron_std = PRON_MAP.get(pron, pron)
|
| 151 |
return f"punya {pron_std} {rest}"
|
| 152 |
-
return re.sub(
|
| 153 |
repl, text, flags=re.IGNORECASE)
|
| 154 |
|
| 155 |
def _handle_mo_bigram(text: str) -> str:
|
|
@@ -157,7 +157,7 @@ def _handle_mo_bigram(text: str) -> str:
|
|
| 157 |
pron = m.group(1).lower()
|
| 158 |
pron_std = PRON_MAP.get(pron, pron)
|
| 159 |
return f"{pron_std} mau"
|
| 160 |
-
return re.sub(
|
| 161 |
repl, text, flags=re.IGNORECASE)
|
| 162 |
|
| 163 |
def _handle_negation_bigrams(text: str) -> str:
|
|
@@ -165,13 +165,13 @@ def _handle_negation_bigrams(text: str) -> str:
|
|
| 165 |
pron = m.group(1).lower()
|
| 166 |
pron_std = PRON_MAP.get(pron, pron)
|
| 167 |
return f"{pron_std} tidak"
|
| 168 |
-
text = re.sub(
|
| 169 |
repl_pron, text, flags=re.IGNORECASE)
|
| 170 |
-
text = re.sub(
|
| 171 |
-
text = re.sub(
|
| 172 |
-
text = re.sub(
|
| 173 |
-
text = re.sub(
|
| 174 |
-
text = re.sub(
|
| 175 |
return text
|
| 176 |
|
| 177 |
def _token_level_ops(text: str, aggressive: bool) -> str:
|
|
|
|
| 92 |
PRENORM_LEVEL = os.getenv("PRENORM_LEVEL", "basic").lower()
|
| 93 |
PRENORM_DEBUG = os.getenv("PRENORM_DEBUG", "0") == "1"
|
| 94 |
|
| 95 |
+
# One or more whitespace characters. Raw string so "\s" reaches the regex
# engine as written instead of relying on an invalid string escape.
WS_RE = re.compile(r"\s+")
|
| 96 |
+
# A consonant followed by two or more repeats of itself (three or more in a
# row), case-insensitive. The pattern must be a raw string: in a normal string
# "\1" is the control character chr(1), not a backreference to group 1.
ELONG_RE = re.compile(r"([bcdfghjklmnpqrstvwxyz])\1{2,}", flags=re.IGNORECASE)
|
| 97 |
+
# Any character that is not a word character, whitespace, or one of
# , . ; : ? ! % ( ) - β /  -- plus the underscore, which \w would otherwise keep.
# Raw string so the regex escapes are not parsed as (invalid) string escapes.
# NOTE(review): "β" looks like a mis-encoded dash character; confirm against
# the original file before changing it.
PUNC_RE = re.compile(r"[^\w\s,.;:?!%()\-\β/]|_")
|
| 98 |
+
# A punctuation mark from , . ; : ? ! immediately repeated one or more times.
# Raw string so "\1" is a backreference to the captured mark rather than the
# control character chr(1).
MULTI_PUNC = re.compile(r"([,.;:?!])\1+")
|
| 99 |
+
# A "-", "β" or "/" together with any whitespace around it; the separator
# itself is captured as group 1. Raw string so "\s" is not parsed as an
# (invalid) string escape.
# NOTE(review): "β" looks like a mis-encoded dash character; confirm against
# the original file before changing it.
DASH_SPACES = re.compile(r"\s*([-β/])\s*")
|
| 100 |
|
| 101 |
WHITELIST_KEEP_ELONG = {"papua", "koteka", "wamena", "sarmi", "sorong"}
|
| 102 |
|
|
|
|
| 149 |
rest = m.group(2).strip()
|
| 150 |
pron_std = PRON_MAP.get(pron, pron)
|
| 151 |
return f"punya {pron_std} {rest}"
|
| 152 |
+
return re.sub("\b(sa|saya|ko|kamu|dia|dong|kam|kalian|kitong|kitorang|kita|torang)\s*pu\s+([^.,;:!?]+)",
|
| 153 |
repl, text, flags=re.IGNORECASE)
|
| 154 |
|
| 155 |
def _handle_mo_bigram(text: str) -> str:
|
|
|
|
| 157 |
pron = m.group(1).lower()
|
| 158 |
pron_std = PRON_MAP.get(pron, pron)
|
| 159 |
return f"{pron_std} mau"
|
| 160 |
+
return re.sub("\b(sa|saya|ko|kamu|dia|dong|kam|kalian|kitong|kitorang|kita|torang)\s+mo\b",
|
| 161 |
repl, text, flags=re.IGNORECASE)
|
| 162 |
|
| 163 |
def _handle_negation_bigrams(text: str) -> str:
|
|
|
|
| 165 |
pron = m.group(1).lower()
|
| 166 |
pron_std = PRON_MAP.get(pron, pron)
|
| 167 |
return f"{pron_std} tidak"
|
| 168 |
+
text = re.sub("\b(sa|saya|ko|kamu|dia|dong|kam|kalian|kitong|kitorang|kita|torang)\s+(tra|ndak|son|tid)\b",
|
| 169 |
repl_pron, text, flags=re.IGNORECASE)
|
| 170 |
+
text = re.sub("\btra\s+ada\b", "tidak ada", text, flags=re.IGNORECASE)
|
| 171 |
+
text = re.sub("\bndak\s+ada\b", "tidak ada", text, flags=re.IGNORECASE)
|
| 172 |
+
text = re.sub("\btid\s+ada\b", "tidak ada", text, flags=re.IGNORECASE)
|
| 173 |
+
text = re.sub("\bson\s+ada\b", "tidak ada", text, flags=re.IGNORECASE)
|
| 174 |
+
text = re.sub("\btidak\s+tau\b", "tidak tahu", text, flags=re.IGNORECASE)
|
| 175 |
return text
|
| 176 |
|
| 177 |
def _token_level_ops(text: str, aggressive: bool) -> str:
|