Spaces:
Sleeping
Sleeping
Update
Browse files
app.py
CHANGED
|
@@ -16,10 +16,8 @@ log = logging.getLogger("papua-app")
|
|
| 16 |
|
| 17 |
# ---------- Flask ----------
|
| 18 |
app = Flask(__name__, template_folder="frontend", static_folder="static")
|
| 19 |
-
# trust HF proxy (HTTPS/host), penting untuk cookie Secure & redirect
|
| 20 |
app.wsgi_app = ProxyFix(app.wsgi_app, x_for=1, x_proto=1, x_host=1)
|
| 21 |
|
| 22 |
-
# session config (HF Spaces iframe-friendly)
|
| 23 |
app.config.update(
|
| 24 |
SECRET_KEY=os.getenv("SECRET_KEY", "dev-secret-change-me"),
|
| 25 |
SESSION_COOKIE_NAME="hfspace_session",
|
|
@@ -95,15 +93,12 @@ def login_required(fn):
|
|
| 95 |
return fn(*args, **kwargs)
|
| 96 |
return _wrap
|
| 97 |
|
| 98 |
-
# ---------- Prenorm
|
| 99 |
-
# ENV:
|
| 100 |
-
# PRENORM_LEVEL = off | basic | aggressive (default: basic)
|
| 101 |
-
# PRENORM_DEBUG = 1 to log internal trace (response API tidak berubah)
|
| 102 |
PRENORM_LEVEL = os.getenv("PRENORM_LEVEL", "basic").lower()
|
| 103 |
PRENORM_DEBUG = os.getenv("PRENORM_DEBUG", "0") == "1"
|
| 104 |
|
| 105 |
WS_RE = re.compile(r"\s+")
|
| 106 |
-
ELONG_RE = re.compile(r"([bcdfghjklmnpqrstvwxyz])\1{2,}", flags=re.IGNORECASE)
|
| 107 |
PUNC_RE = re.compile(r"[^\w\s,.;:?!%()\-\β/]|_")
|
| 108 |
MULTI_PUNC = re.compile(r"([,.;:?!])\1+")
|
| 109 |
DASH_SPACES= re.compile(r"\s*([-β/])\s*")
|
|
@@ -122,7 +117,6 @@ PAPUA_MAP = {
|
|
| 122 |
"ma": "sama", "deng": "dengan", "dgn": "dengan",
|
| 123 |
"kira2": "kira-kira", "bgmn": "bagaimana", "gmn": "bagaimana",
|
| 124 |
}
|
| 125 |
-
|
| 126 |
PRON_MAP = {
|
| 127 |
"sa": "saya", "saya": "saya",
|
| 128 |
"ko": "kamu", "kamu": "kamu",
|
|
@@ -133,28 +127,21 @@ PRON_MAP = {
|
|
| 133 |
|
| 134 |
def _normalize_unicode(text: str) -> str:
|
| 135 |
return unicodedata.normalize("NFKC", text)
|
| 136 |
-
|
| 137 |
def _strip_emoji_and_noise(text: str) -> str:
|
| 138 |
text = PUNC_RE.sub(" ", text)
|
| 139 |
text = MULTI_PUNC.sub(r"\1", text)
|
| 140 |
text = DASH_SPACES.sub(r" \1 ", text)
|
| 141 |
return text
|
| 142 |
-
|
| 143 |
def _normalize_ws(text: str) -> str:
|
| 144 |
return WS_RE.sub(" ", text).strip()
|
| 145 |
-
|
| 146 |
def _reduce_elongation(token: str) -> str:
|
| 147 |
base = token.lower()
|
| 148 |
-
if base in WHITELIST_KEEP_ELONG:
|
| 149 |
-
return token
|
| 150 |
return ELONG_RE.sub(r"\1\1", token)
|
| 151 |
-
|
| 152 |
def _apply_papua_map(token: str) -> str:
|
| 153 |
low = token.lower()
|
| 154 |
return PAPUA_MAP.get(low, token)
|
| 155 |
-
|
| 156 |
def _handle_pu_constructs(text: str) -> str:
|
| 157 |
-
# "<pronoun> pu <X>" -> "punya <pronoun_std> <X>"
|
| 158 |
def repl(m):
|
| 159 |
pron = m.group(1).lower()
|
| 160 |
rest = m.group(2).strip()
|
|
@@ -162,7 +149,6 @@ def _handle_pu_constructs(text: str) -> str:
|
|
| 162 |
return f"punya {pron_std} {rest}"
|
| 163 |
return re.sub(r"\b(sa|saya|ko|kamu|dia|dong|kam|kalian|kitong|kitorang|kita|torang)\s*pu\s+([^.,;:!?]+)",
|
| 164 |
repl, text, flags=re.IGNORECASE)
|
| 165 |
-
|
| 166 |
def _token_level_ops(text: str, aggressive: bool) -> str:
|
| 167 |
tokens = text.split()
|
| 168 |
out = []
|
|
@@ -173,16 +159,9 @@ def _token_level_ops(text: str, aggressive: bool) -> str:
|
|
| 173 |
return " ".join(out)
|
| 174 |
|
| 175 |
def papua_prenorm(inp: str, level: str = "basic", return_trace: bool = False):
|
| 176 |
-
"""
|
| 177 |
-
level:
|
| 178 |
-
- "off": tanpa perubahan
|
| 179 |
-
- "basic": unicode + ws + map slang + 'pu' constructs
|
| 180 |
-
- "aggressive": basic + reduksi huruf berulang + pembersihan simbol non-teks
|
| 181 |
-
"""
|
| 182 |
trace = {"level": level}
|
| 183 |
if level == "off":
|
| 184 |
return (inp, trace) if return_trace else inp
|
| 185 |
-
|
| 186 |
s0 = inp
|
| 187 |
s1 = _normalize_unicode(s0)
|
| 188 |
s2 = _strip_emoji_and_noise(s1) if level == "aggressive" else s1
|
|
@@ -190,18 +169,14 @@ def papua_prenorm(inp: str, level: str = "basic", return_trace: bool = False):
|
|
| 190 |
s4 = _handle_pu_constructs(s3)
|
| 191 |
s5 = _token_level_ops(s4, aggressive=(level == "aggressive"))
|
| 192 |
s6 = _normalize_ws(s5)
|
| 193 |
-
|
| 194 |
if return_trace:
|
| 195 |
-
trace.update({
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
"token_ops": s5, "final": s6,
|
| 200 |
-
})
|
| 201 |
return s6, trace
|
| 202 |
return s6
|
| 203 |
|
| 204 |
-
# Wrapper publik yang dipakai endpoint
|
| 205 |
def prenorm(text: str) -> str:
|
| 206 |
if PRENORM_DEBUG:
|
| 207 |
out, tr = papua_prenorm(text, level=PRENORM_LEVEL, return_trace=True)
|
|
@@ -209,8 +184,7 @@ def prenorm(text: str) -> str:
|
|
| 209 |
return out
|
| 210 |
return papua_prenorm(text, level=PRENORM_LEVEL, return_trace=False)
|
| 211 |
|
| 212 |
-
# ---------- Model (lazy)
|
| 213 |
-
# Default diarahkan ke repo kamu di Hugging Face.
|
| 214 |
BASE_MODEL_ID = os.getenv("BASE_MODEL_ID", "amosnbn/cendol-mt5-base-inst")
|
| 215 |
ADAPTER_ID = os.getenv("ADAPTER_ID", "amosnbn/papua-lora-ckpt-168")
|
| 216 |
DEVICE = "cuda" if os.getenv("DEVICE", "cpu") == "cuda" else "cpu"
|
|
@@ -271,27 +245,15 @@ def _sanitize_adapter_config(adapter_dir: str):
|
|
| 271 |
json.dump(cleaned, f, ensure_ascii=False, indent=2)
|
| 272 |
|
| 273 |
def _load_model():
|
| 274 |
-
"""Download ke /tmp, strip BOM, sanitize adapter_config, lalu load."""
|
| 275 |
global TOK, MODEL, _MODEL_READY, _MODEL_ERROR
|
| 276 |
try:
|
| 277 |
log.info("[MODEL] downloading base=%s adapter=%s", BASE_MODEL_ID, ADAPTER_ID or "-")
|
| 278 |
-
|
| 279 |
-
base_dir = snapshot_download(
|
| 280 |
-
repo_id=BASE_MODEL_ID,
|
| 281 |
-
local_dir="/tmp/hf_base",
|
| 282 |
-
local_dir_use_symlinks=False,
|
| 283 |
-
allow_patterns=None,
|
| 284 |
-
)
|
| 285 |
_strip_bom_in_dir(base_dir)
|
| 286 |
|
| 287 |
adapter_dir = None
|
| 288 |
if ADAPTER_ID:
|
| 289 |
-
adapter_dir = snapshot_download(
|
| 290 |
-
repo_id=ADAPTER_ID,
|
| 291 |
-
local_dir="/tmp/hf_adapter",
|
| 292 |
-
local_dir_use_symlinks=False,
|
| 293 |
-
allow_patterns=None,
|
| 294 |
-
)
|
| 295 |
_strip_bom_in_dir(adapter_dir)
|
| 296 |
_sanitize_adapter_config(adapter_dir)
|
| 297 |
|
|
@@ -301,7 +263,6 @@ def _load_model():
|
|
| 301 |
|
| 302 |
TOK = AutoTokenizer.from_pretrained(base_dir)
|
| 303 |
base = AutoModelForSeq2SeqLM.from_pretrained(base_dir)
|
| 304 |
-
|
| 305 |
MODEL = PeftModel.from_pretrained(base, adapter_dir) if adapter_dir else base
|
| 306 |
MODEL.eval().to(DEVICE)
|
| 307 |
|
|
@@ -339,10 +300,8 @@ def translate_with_model(text: str, max_new_tokens: int = 48) -> str:
|
|
| 339 |
return tok.decode(out[0], skip_special_tokens=True)
|
| 340 |
|
| 341 |
def _preload_thread():
|
| 342 |
-
try:
|
| 343 |
-
|
| 344 |
-
except Exception:
|
| 345 |
-
pass
|
| 346 |
|
| 347 |
if PRELOAD_MODEL:
|
| 348 |
threading.Thread(target=_preload_thread, daemon=True).start()
|
|
@@ -382,17 +341,6 @@ def diag():
|
|
| 382 |
"prenorm": {"level": PRENORM_LEVEL, "debug": PRENORM_DEBUG},
|
| 383 |
})
|
| 384 |
|
| 385 |
-
@app.get("/debug/session/set")
|
| 386 |
-
def dbg_set():
|
| 387 |
-
session.permanent = True
|
| 388 |
-
session["uid"] = "test-user"
|
| 389 |
-
session["email"] = "test@example.com"
|
| 390 |
-
return {"ok": True, "set": True}
|
| 391 |
-
|
| 392 |
-
@app.get("/debug/session/get")
|
| 393 |
-
def dbg_get():
|
| 394 |
-
return {"uid": session.get("uid"), "email": session.get("email")}
|
| 395 |
-
|
| 396 |
# ---------- Auth & Pages ----------
|
| 397 |
@app.get("/health")
|
| 398 |
@app.get("/ping")
|
|
@@ -401,6 +349,8 @@ def health():
|
|
| 401 |
|
| 402 |
@app.get("/login")
|
| 403 |
def login_get():
|
|
|
|
|
|
|
| 404 |
return render_template("login.html")
|
| 405 |
|
| 406 |
@app.post("/login")
|
|
@@ -424,6 +374,8 @@ def login_post():
|
|
| 424 |
|
| 425 |
@app.get("/register")
|
| 426 |
def register_get():
|
|
|
|
|
|
|
| 427 |
return render_template("register.html")
|
| 428 |
|
| 429 |
@app.post("/register")
|
|
@@ -440,10 +392,10 @@ def register_post():
|
|
| 440 |
return redirect(url_for("register_get"))
|
| 441 |
u = User(email=email); set_password(u, pwd)
|
| 442 |
s.add(u); s.commit()
|
| 443 |
-
session.permanent = True
|
| 444 |
-
session["uid"], session["email"] = u.id, u.email
|
| 445 |
|
| 446 |
-
|
|
|
|
|
|
|
| 447 |
|
| 448 |
@app.get("/logout")
|
| 449 |
def logout():
|
|
|
|
| 16 |
|
| 17 |
# ---------- Flask ----------
|
| 18 |
# WSGI application object; templates are read from "frontend" and static
# files from "static".
app = Flask(
    __name__,
    template_folder="frontend",
    static_folder="static",
)
# Trust exactly one proxy hop for the forwarded client address, scheme and host.
app.wsgi_app = ProxyFix(
    app.wsgi_app,
    x_for=1,
    x_proto=1,
    x_host=1,
)
|
| 20 |
|
|
|
|
| 21 |
app.config.update(
|
| 22 |
SECRET_KEY=os.getenv("SECRET_KEY", "dev-secret-change-me"),
|
| 23 |
SESSION_COOKIE_NAME="hfspace_session",
|
|
|
|
| 93 |
return fn(*args, **kwargs)
|
| 94 |
return _wrap
|
| 95 |
|
| 96 |
+
# ---------- Prenorm ----------
|
|
|
|
|
|
|
|
|
|
| 97 |
# Pre-normalization settings, read once from the environment at import time.
#   PRENORM_LEVEL: lower-cased; the pipeline compares it with "off" and
#                  "aggressive" (default "basic").
#   PRENORM_DEBUG: true only when the variable is exactly "1".
PRENORM_LEVEL = os.environ.get("PRENORM_LEVEL", "basic").lower()
PRENORM_DEBUG = os.environ.get("PRENORM_DEBUG", "0") == "1"

# One or more whitespace characters.
WS_RE = re.compile(r"\s+")
# A consonant repeated three or more times in a row (case-insensitive).
ELONG_RE = re.compile(r"([bcdfghjklmnpqrstvwxyz])\1{2,}", re.IGNORECASE)
# An underscore, or any character that is neither a word character,
# whitespace, nor one of the punctuation marks listed in the class.
# NOTE(review): the "β" here and in DASH_SPACES looks like a mis-encoded
# dash character - confirm against the real file before relying on it.
PUNC_RE = re.compile(r"[^\w\s,.;:?!%()\-\β/]|_")
# The same punctuation mark two or more times in a row.
MULTI_PUNC = re.compile(r"([,.;:?!])\1+")
# A "-", "β" or "/" together with any whitespace around it.
DASH_SPACES = re.compile(r"\s*([-β/])\s*")
|
|
|
|
| 117 |
"ma": "sama", "deng": "dengan", "dgn": "dengan",
|
| 118 |
"kira2": "kira-kira", "bgmn": "bagaimana", "gmn": "bagaimana",
|
| 119 |
}
|
|
|
|
| 120 |
PRON_MAP = {
|
| 121 |
"sa": "saya", "saya": "saya",
|
| 122 |
"ko": "kamu", "kamu": "kamu",
|
|
|
|
| 127 |
|
| 128 |
def _normalize_unicode(text: str) -> str:
|
| 129 |
return unicodedata.normalize("NFKC", text)
|
|
|
|
| 130 |
def _strip_emoji_and_noise(text: str) -> str:
    """Clean symbol noise from *text* in three regex passes.

    1. Every PUNC_RE match becomes a single space.
    2. Every MULTI_PUNC match collapses to one copy of its captured character.
    3. Every DASH_SPACES match is rewritten as its captured character with
       one space on each side.
    """
    cleaned = PUNC_RE.sub(" ", text)
    cleaned = MULTI_PUNC.sub(r"\1", cleaned)
    return DASH_SPACES.sub(r" \1 ", cleaned)
|
|
|
|
| 135 |
def _normalize_ws(text: str) -> str:
    """Replace each WS_RE match in *text* with one space and trim both ends."""
    collapsed = WS_RE.sub(" ", text)
    return collapsed.strip()
|
|
|
|
| 137 |
def _reduce_elongation(token: str) -> str:
    """Shorten every ELONG_RE match in *token* to two copies of its character.

    A token whose lower-cased form is in WHITELIST_KEEP_ELONG is returned
    unchanged.
    """
    keep_as_is = token.lower() in WHITELIST_KEEP_ELONG
    return token if keep_as_is else ELONG_RE.sub(r"\1\1", token)
|
|
|
|
| 141 |
def _apply_papua_map(token: str) -> str:
    """Look up the lower-cased *token* in PAPUA_MAP.

    Returns the mapped value, or *token* itself (original casing) when the
    map has no entry for it.
    """
    key = token.lower()
    if key in PAPUA_MAP:
        return PAPUA_MAP[key]
    return token
|
|
|
|
| 144 |
def _handle_pu_constructs(text: str) -> str:
|
|
|
|
| 145 |
def repl(m):
|
| 146 |
pron = m.group(1).lower()
|
| 147 |
rest = m.group(2).strip()
|
|
|
|
| 149 |
return f"punya {pron_std} {rest}"
|
| 150 |
return re.sub(r"\b(sa|saya|ko|kamu|dia|dong|kam|kalian|kitong|kitorang|kita|torang)\s*pu\s+([^.,;:!?]+)",
|
| 151 |
repl, text, flags=re.IGNORECASE)
|
|
|
|
| 152 |
def _token_level_ops(text: str, aggressive: bool) -> str:
|
| 153 |
tokens = text.split()
|
| 154 |
out = []
|
|
|
|
| 159 |
return " ".join(out)
|
| 160 |
|
| 161 |
def papua_prenorm(inp: str, level: str = "basic", return_trace: bool = False):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
trace = {"level": level}
|
| 163 |
if level == "off":
|
| 164 |
return (inp, trace) if return_trace else inp
|
|
|
|
| 165 |
s0 = inp
|
| 166 |
s1 = _normalize_unicode(s0)
|
| 167 |
s2 = _strip_emoji_and_noise(s1) if level == "aggressive" else s1
|
|
|
|
| 169 |
s4 = _handle_pu_constructs(s3)
|
| 170 |
s5 = _token_level_ops(s4, aggressive=(level == "aggressive"))
|
| 171 |
s6 = _normalize_ws(s5)
|
|
|
|
| 172 |
if return_trace:
|
| 173 |
+
trace.update({"original": s0, "unicode_norm": s1,
|
| 174 |
+
"strip_noise": s2 if level == "aggressive" else "(skip)",
|
| 175 |
+
"ws_norm_1": s3, "pu_constructs": s4,
|
| 176 |
+
"token_ops": s5, "final": s6})
|
|
|
|
|
|
|
| 177 |
return s6, trace
|
| 178 |
return s6
|
| 179 |
|
|
|
|
| 180 |
def prenorm(text: str) -> str:
|
| 181 |
if PRENORM_DEBUG:
|
| 182 |
out, tr = papua_prenorm(text, level=PRENORM_LEVEL, return_trace=True)
|
|
|
|
| 184 |
return out
|
| 185 |
return papua_prenorm(text, level=PRENORM_LEVEL, return_trace=False)
|
| 186 |
|
| 187 |
+
# ---------- Model (lazy) ----------
|
|
|
|
| 188 |
# Repo ids handed to snapshot_download for the base model and the adapter;
# both can be overridden through the environment. An empty ADAPTER_ID makes
# the loader skip the adapter download.
BASE_MODEL_ID = os.environ.get("BASE_MODEL_ID", "amosnbn/cendol-mt5-base-inst")
ADAPTER_ID = os.environ.get("ADAPTER_ID", "amosnbn/papua-lora-ckpt-168")
# Only the exact value "cuda" is honoured; anything else falls back to "cpu".
DEVICE = "cpu" if os.environ.get("DEVICE", "cpu") != "cuda" else "cuda"
|
|
|
|
| 245 |
json.dump(cleaned, f, ensure_ascii=False, indent=2)
|
| 246 |
|
| 247 |
def _load_model():
|
|
|
|
| 248 |
global TOK, MODEL, _MODEL_READY, _MODEL_ERROR
|
| 249 |
try:
|
| 250 |
log.info("[MODEL] downloading base=%s adapter=%s", BASE_MODEL_ID, ADAPTER_ID or "-")
|
| 251 |
+
base_dir = snapshot_download(repo_id=BASE_MODEL_ID, local_dir="/tmp/hf_base", local_dir_use_symlinks=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
_strip_bom_in_dir(base_dir)
|
| 253 |
|
| 254 |
adapter_dir = None
|
| 255 |
if ADAPTER_ID:
|
| 256 |
+
adapter_dir = snapshot_download(repo_id=ADAPTER_ID, local_dir="/tmp/hf_adapter", local_dir_use_symlinks=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
_strip_bom_in_dir(adapter_dir)
|
| 258 |
_sanitize_adapter_config(adapter_dir)
|
| 259 |
|
|
|
|
| 263 |
|
| 264 |
TOK = AutoTokenizer.from_pretrained(base_dir)
|
| 265 |
base = AutoModelForSeq2SeqLM.from_pretrained(base_dir)
|
|
|
|
| 266 |
MODEL = PeftModel.from_pretrained(base, adapter_dir) if adapter_dir else base
|
| 267 |
MODEL.eval().to(DEVICE)
|
| 268 |
|
|
|
|
| 300 |
return tok.decode(out[0], skip_special_tokens=True)
|
| 301 |
|
| 302 |
def _preload_thread():
    """Thread target that loads the model eagerly via ``_load_model``.

    Any exception is still caught so it cannot escape the thread, but it is
    now logged with its traceback instead of being dropped silently.
    """
    try:
        _load_model()
    except Exception:
        # Best-effort preload: record the failure, never re-raise.
        log.exception("[MODEL] preload failed")
|
|
|
|
|
|
|
| 305 |
|
| 306 |
# When preloading is enabled, load the model on a daemon thread at import time.
if PRELOAD_MODEL:
    threading.Thread(
        target=_preload_thread,
        daemon=True,
    ).start()
|
|
|
|
| 341 |
"prenorm": {"level": PRENORM_LEVEL, "debug": PRENORM_DEBUG},
|
| 342 |
})
|
| 343 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 344 |
# ---------- Auth & Pages ----------
|
| 345 |
@app.get("/health")
|
| 346 |
@app.get("/ping")
|
|
|
|
| 349 |
|
| 350 |
@app.get("/login")
def login_get():
    """Serve the login page; a session that already has a ``uid`` goes to ``index``."""
    already_logged_in = bool(session.get("uid"))
    if not already_logged_in:
        return render_template("login.html")
    # Already logged in -> go to the home page.
    return redirect(url_for("index"))
|
| 355 |
|
| 356 |
@app.post("/login")
|
|
|
|
| 374 |
|
| 375 |
@app.get("/register")
def register_get():
    """Serve the registration page; a session that already has a ``uid`` goes to ``index``."""
    already_logged_in = bool(session.get("uid"))
    if not already_logged_in:
        return render_template("register.html")
    # Already logged in -> go to the home page.
    return redirect(url_for("index"))
|
| 380 |
|
| 381 |
@app.post("/register")
|
|
|
|
| 392 |
return redirect(url_for("register_get"))
|
| 393 |
u = User(email=email); set_password(u, pwd)
|
| 394 |
s.add(u); s.commit()
|
|
|
|
|
|
|
| 395 |
|
| 396 |
+
# NOTE: no auto-login after registration; the user must log in manually.
|
| 397 |
+
flash("Registrasi berhasil. Silakan login.", "success")
|
| 398 |
+
return redirect(url_for("login_get"))
|
| 399 |
|
| 400 |
@app.get("/logout")
|
| 401 |
def logout():
|