Spaces:
Sleeping
Sleeping
ganti model
Browse files
app.py
CHANGED
|
@@ -3,7 +3,7 @@
|
|
| 3 |
# Fitur: ProxyFix, cookie Spaces, session permanent, /diag, preload, fallback,
|
| 4 |
# strip BOM JSON, sanitize adapter_config, endpoint /history + homepage responsive.
|
| 5 |
|
| 6 |
-
import os, re, json, codecs, pathlib, logging, threading, traceback, inspect
|
| 7 |
from datetime import datetime, timezone, timedelta
|
| 8 |
from functools import wraps
|
| 9 |
from flask import Flask, render_template, request, redirect, url_for, session, jsonify, flash
|
|
@@ -93,20 +93,124 @@ def login_required(fn):
|
|
| 93 |
return fn(*args, **kwargs)
|
| 94 |
return _wrap
|
| 95 |
|
| 96 |
-
# ---------- Prenorm ----------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
PAPUA_MAP = {
|
| 98 |
-
|
| 99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
def prenorm(text: str) -> str:
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
|
|
|
| 106 |
|
| 107 |
# ---------- Model (lazy) + strip BOM + sanitize adapter ----------
|
| 108 |
-
|
| 109 |
-
|
|
|
|
| 110 |
DEVICE = "cuda" if os.getenv("DEVICE", "cpu") == "cuda" else "cpu"
|
| 111 |
|
| 112 |
TOK = None
|
|
@@ -271,6 +375,7 @@ def diag():
|
|
| 271 |
"model_error": _MODEL_ERROR,
|
| 272 |
"versions": {"python": sys.version, "torch": torch_v, "transformers": tf_v, "peft": peft_v},
|
| 273 |
"preload": PRELOAD_MODEL,
|
|
|
|
| 274 |
})
|
| 275 |
|
| 276 |
@app.get("/debug/session/set")
|
|
@@ -316,7 +421,7 @@ def register_get():
|
|
| 316 |
def register_post():
|
| 317 |
email = (request.form.get("email") or "").strip().lower()
|
| 318 |
pwd = (request.form.get("password") or "")
|
| 319 |
-
if not email
|
| 320 |
flash("Isi email dan password", "error"); return redirect(url_for("register_get"))
|
| 321 |
with SessionLocal() as s:
|
| 322 |
if s.query(User).filter_by(email=email).first():
|
|
@@ -372,7 +477,7 @@ def api_translate():
|
|
| 372 |
if not text:
|
| 373 |
return jsonify({"ok": False, "error": "Empty text"}), 400
|
| 374 |
try:
|
| 375 |
-
clean = prenorm(text)
|
| 376 |
mt = f"[FAKE] {clean}" if FALLBACK_TRANSLATE else translate_with_model(clean, max_new_tokens=max_new)
|
| 377 |
with SessionLocal() as s:
|
| 378 |
s.add(Translation(user_id=session["uid"], src=text, mt=mt))
|
|
|
|
| 3 |
# Fitur: ProxyFix, cookie Spaces, session permanent, /diag, preload, fallback,
|
| 4 |
# strip BOM JSON, sanitize adapter_config, endpoint /history + homepage responsive.
|
| 5 |
|
| 6 |
+
import os, re, json, codecs, pathlib, logging, threading, traceback, inspect, unicodedata
|
| 7 |
from datetime import datetime, timezone, timedelta
|
| 8 |
from functools import wraps
|
| 9 |
from flask import Flask, render_template, request, redirect, url_for, session, jsonify, flash
|
|
|
|
| 93 |
return fn(*args, **kwargs)
|
| 94 |
return _wrap
|
| 95 |
|
| 96 |
+
# ---------- Prenorm (enhanced, controllable via ENV) ----------
# Environment variables:
#   PRENORM_LEVEL = off | basic | aggressive   (default: basic)
#   PRENORM_DEBUG = 1 to log the internal trace (the API response is unchanged)
PRENORM_LEVEL = os.environ.get("PRENORM_LEVEL", "basic").lower()
PRENORM_DEBUG = os.environ.get("PRENORM_DEBUG", "0") == "1"

# One or more whitespace characters.
WS_RE = re.compile(r"\s+")

# The same consonant repeated three or more times in a row (case-insensitive).
ELONG_RE = re.compile(r"([bcdfghjklmnpqrstvwxyz])\1{2,}", flags=re.IGNORECASE)

# Underscore, or any character that is not a word character, whitespace, or one
# of the punctuation marks listed in the class.
# NOTE(review): the "β" in this class and in DASH_SPACES looks like a
# mis-encoded dash character (possibly an en dash) - confirm against the
# repository copy of the file before relying on it.
PUNC_RE = re.compile(r"[^\w\s,.;:?!%()\-\β/]|_")

# A sentence punctuation mark immediately repeated.
MULTI_PUNC = re.compile(r"([,.;:?!])\1+")

# A dash-like character or slash together with any whitespace around it.
DASH_SPACES = re.compile(r"\s*([-β/])\s*")

# Lower-case tokens that _reduce_elongation returns untouched.
WHITELIST_KEEP_ELONG = {
    "papua",
    "koteka",
    "wamena",
    "sarmi",
    "sorong",
}
|
| 110 |
+
|
| 111 |
# Lower-case slang/abbreviation token -> standard replacement, looked up per
# token by _apply_papua_map.
PAPUA_MAP = {
    # pronouns
    "sa": "saya",
    "sy": "saya",
    "beta": "saya",
    "ko": "kamu",
    "kau": "kamu",
    "dong": "mereka",
    "kam": "kalian",
    "kamong": "kalian",
    "kitong": "kita",
    "kitorang": "kita",
    "torang": "kita",
    # negation
    "tra": "tidak",
    "tr": "tidak",
    "trada": "tidak ada",
    "son": "tidak",
    "ndak": "tidak",
    "tid": "tidak",
    # auxiliaries
    "mo": "mau",
    "su": "sudah",
    "uda": "sudah",
    # time words
    "skarang": "sekarang",
    "td": "tadi",
    "tar": "nanti",
    "tarlah": "nanti",
    # verbs and function words
    "pigi": "pergi",
    "pi": "pergi",
    "ma": "sama",
    "deng": "dengan",
    "dgn": "dengan",
    # abbreviations
    "kira2": "kira-kira",
    "bgmn": "bagaimana",
    "gmn": "bagaimana",
}
|
| 123 |
+
|
| 124 |
+
# Lower-case pronoun (dialect or standard spelling) -> standard pronoun, used
# by _handle_pu_constructs when rewriting "<pronoun> pu <X>" phrases.
PRON_MAP = {
    "sa": "saya",
    "saya": "saya",
    "ko": "kamu",
    "kamu": "kamu",
    "dia": "dia",
    "dong": "mereka",
    "kam": "kalian",
    "kalian": "kalian",
    "kitong": "kita",
    "kitorang": "kita",
    "kita": "kita",
    "torang": "kita",
}
|
| 131 |
+
|
| 132 |
+
def _normalize_unicode(text: str) -> str:
|
| 133 |
+
return unicodedata.normalize("NFKC", text)
|
| 134 |
+
|
| 135 |
+
def _strip_emoji_and_noise(text: str) -> str:
    """Blank out noise characters and tidy punctuation in *text*.

    Three substitutions are applied in order:

    1. every ``PUNC_RE`` match is replaced by a single space;
    2. every ``MULTI_PUNC`` match (a repeated punctuation mark) is collapsed
       to one occurrence of that mark;
    3. every ``DASH_SPACES`` match is rewritten as the captured character with
       exactly one space on each side.

    Whitespace runs introduced here are not collapsed by this function.
    """
    substitutions = (
        (PUNC_RE, " "),
        (MULTI_PUNC, r"\1"),
        (DASH_SPACES, r" \1 "),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = pattern.sub(replacement, cleaned)
    return cleaned
|
| 140 |
+
|
| 141 |
+
def _normalize_ws(text: str) -> str:
    """Collapse each ``WS_RE`` match to one space and trim both ends."""
    collapsed = WS_RE.sub(" ", text)
    return collapsed.strip()
|
| 143 |
+
|
| 144 |
+
def _reduce_elongation(token: str) -> str:
    """Shorten elongated consonant runs in *token* to a doubled letter.

    Tokens whose lower-cased form is in ``WHITELIST_KEEP_ELONG`` are returned
    unchanged; otherwise every ``ELONG_RE`` match is replaced by two copies of
    the captured letter.
    """
    if token.lower() not in WHITELIST_KEEP_ELONG:
        return ELONG_RE.sub(r"\1\1", token)
    return token
|
| 149 |
+
|
| 150 |
+
def _apply_papua_map(token: str) -> str:
    """Replace a slang token with its ``PAPUA_MAP`` entry, if it has one.

    The lookup is case-insensitive. The whole token is tried first; if that
    fails, leading and trailing punctuation is peeled off, the remaining core
    is looked up, and the punctuation is re-attached around the replacement.
    Tokens come from a plain whitespace split, so without this step slang
    that touches punctuation ("ko?", "tra,") would never be normalised.

    Args:
        token: A single whitespace-delimited token.

    Returns:
        The mapped replacement (with any edge punctuation preserved), or the
        token unchanged when neither the token nor its core is in the map.
    """
    exact = PAPUA_MAP.get(token.lower())
    if exact is not None:
        return exact

    # ASCII punctuation that may be glued to the edges of a word.
    edge_chars = ".,;:?!()[]{}\"'"
    without_lead = token.lstrip(edge_chars)
    lead = token[: len(token) - len(without_lead)]
    core = without_lead.rstrip(edge_chars)
    trail = without_lead[len(core):]

    # Nothing was peeled off (already looked up above) or only punctuation.
    if not core or core == token:
        return token

    mapped = PAPUA_MAP.get(core.lower())
    if mapped is None:
        return token
    return f"{lead}{mapped}{trail}"
|
| 153 |
+
|
| 154 |
+
def _handle_pu_constructs(text: str) -> str:
    """Rewrite "<pronoun> pu <X>" phrases as "punya <pronoun_std> <X>".

    The pronoun is matched case-insensitively and replaced by its standard
    form from ``PRON_MAP``. ``<X>`` extends to whichever comes first: a
    clause punctuation mark (one of ``.,;:!?``), the next "<pronoun> pu"
    phrase, or the end of the text. It is emitted with surrounding
    whitespace stripped.

    Stopping at the next "<pronoun> pu" phrase matters: a single greedy
    capture would swallow later constructs in the same clause and leave them
    unconverted. With the lookahead,
    "sa pu rumah dan ko pu mobil" becomes
    "punya saya rumah dan punya kamu mobil".

    Args:
        text: Input text.

    Returns:
        The text with every matched construct rewritten; everything else is
        left as it was.
    """
    pronouns = r"sa|saya|ko|kamu|dia|dong|kam|kalian|kitong|kitorang|kita|torang"
    stops = r".,;:!?"

    # NOTE(review): "\s*" also accepts no space before "pu", so a word such
    # as "sapu" is treated like "sa pu" - confirm that this is intended.
    # `head` is the start of a construct that would itself match: pronoun,
    # "pu", whitespace and at least one non-punctuation character.
    head = rf"\b(?:{pronouns})\s*pu\s+[^{stops}]"
    pattern = (
        rf"\b({pronouns})\s*pu\s+"
        rf"([^{stops}]+?)"
        rf"(?=\s*{head}|[{stops}]|\Z)"
    )

    def repl(m):
        pron = m.group(1).lower()
        rest = m.group(2).strip()
        pron_std = PRON_MAP.get(pron, pron)
        return f"punya {pron_std} {rest}"

    return re.sub(pattern, repl, text, flags=re.IGNORECASE)
|
| 163 |
+
|
| 164 |
+
def _token_level_ops(text: str, aggressive: bool) -> str:
    """Apply per-token normalisation to whitespace-separated tokens.

    Each token is passed through ``_reduce_elongation`` (only when
    *aggressive* is true) and then through ``_apply_papua_map``. The results
    are joined back together with single spaces.
    """
    def convert(word):
        if aggressive:
            word = _reduce_elongation(word)
        return _apply_papua_map(word)

    return " ".join(convert(word) for word in text.split())
|
| 172 |
+
|
| 173 |
+
def papua_prenorm(inp: str, level: str = "basic", return_trace: bool = False):
    """Pre-normalise *inp* before translation.

    level:
      - "off": return the input unchanged
      - "aggressive": Unicode normalisation, symbol/noise clean-up,
        whitespace normalisation, "pu" constructs, repeated-letter reduction
        and the slang map
      - any other value (including the default "basic"): the same pipeline
        without the noise clean-up and repeated-letter reduction

    Returns the normalised string, or a ``(string, trace)`` tuple when
    *return_trace* is true. The trace is a dict holding the level and, unless
    the level is "off", the intermediate result of every pipeline stage.
    """
    trace = {"level": level}
    if level == "off":
        if return_trace:
            return inp, trace
        return inp

    is_aggressive = level == "aggressive"

    unicode_norm = _normalize_unicode(inp)
    if is_aggressive:
        denoised = _strip_emoji_and_noise(unicode_norm)
    else:
        denoised = unicode_norm
    ws_norm = _normalize_ws(denoised)
    pu_done = _handle_pu_constructs(ws_norm)
    token_done = _token_level_ops(pu_done, aggressive=is_aggressive)
    final = _normalize_ws(token_done)

    if not return_trace:
        return final

    trace.update({
        "original": inp,
        "unicode_norm": unicode_norm,
        "strip_noise": denoised if is_aggressive else "(skip)",
        "ws_norm_1": ws_norm,
        "pu_constructs": pu_done,
        "token_ops": token_done,
        "final": final,
    })
    return final, trace
|
| 201 |
+
|
| 202 |
+
# Public wrapper used by the endpoints (same signature as before)
|
| 203 |
def prenorm(text: str) -> str:
    """Pre-normalise *text* at the level configured in ``PRENORM_LEVEL``.

    When ``PRENORM_DEBUG`` is set, the input, the output and the JSON-encoded
    stage trace are also written to the log at INFO level. The return value
    is the normalised string either way.
    """
    if not PRENORM_DEBUG:
        return papua_prenorm(text, level=PRENORM_LEVEL, return_trace=False)

    result, trace = papua_prenorm(text, level=PRENORM_LEVEL, return_trace=True)
    log.info(
        "[PRENORM][%s] %s -> %s | trace=%s",
        PRENORM_LEVEL,
        text,
        result,
        json.dumps(trace, ensure_ascii=False),
    )
    return result
|
| 209 |
|
| 210 |
# ---------- Model (lazy) + strip BOM + sanitize adapter ----------
|
| 211 |
+
# The defaults point to your own repositories on Hugging Face.
|
| 212 |
+
BASE_MODEL_ID = os.getenv("BASE_MODEL_ID", "amosnbn/cendol-mt5-base-inst")
|
| 213 |
+
ADAPTER_ID = os.getenv("ADAPTER_ID", "amosnbn/papua-lora-ckpt-168")
|
| 214 |
DEVICE = "cuda" if os.getenv("DEVICE", "cpu") == "cuda" else "cpu"
|
| 215 |
|
| 216 |
TOK = None
|
|
|
|
| 375 |
"model_error": _MODEL_ERROR,
|
| 376 |
"versions": {"python": sys.version, "torch": torch_v, "transformers": tf_v, "peft": peft_v},
|
| 377 |
"preload": PRELOAD_MODEL,
|
| 378 |
+
"prenorm": {"level": PRENORM_LEVEL, "debug": PRENORM_DEBUG},
|
| 379 |
})
|
| 380 |
|
| 381 |
@app.get("/debug/session/set")
|
|
|
|
| 421 |
def register_post():
|
| 422 |
email = (request.form.get("email") or "").strip().lower()
|
| 423 |
pwd = (request.form.get("password") or "")
|
| 424 |
+
if not email atau not pwd:
|
| 425 |
flash("Isi email dan password", "error"); return redirect(url_for("register_get"))
|
| 426 |
with SessionLocal() as s:
|
| 427 |
if s.query(User).filter_by(email=email).first():
|
|
|
|
| 477 |
if not text:
|
| 478 |
return jsonify({"ok": False, "error": "Empty text"}), 400
|
| 479 |
try:
|
| 480 |
+
clean = prenorm(text) # <- tetap sama namanya, sekarang lebih pintar
|
| 481 |
mt = f"[FAKE] {clean}" if FALLBACK_TRANSLATE else translate_with_model(clean, max_new_tokens=max_new)
|
| 482 |
with SessionLocal() as s:
|
| 483 |
s.add(Translation(user_id=session["uid"], src=text, mt=mt))
|