cert-study-app / ingest.py
Kentlo's picture
์ค‘๋ณต๋‹ต๋ณ€ ๊ฐ€๋Šฅ ๋ฒ„์ „
08348fb
# ==============================================
# ingest.py (v2025-UNIVERSAL-COMPAT)
# ✅ Fully compatible with the current db.py / init_db() (which takes no arguments)
# ✅ Works safely against the existing questions table schema ( options_json / answer / pairs / sequence )
# ✅ Automatically uses the extended model ( set_answer_keys / set_answer_steps / set_images / page_no, etc. ) when it is present
# ✅ options are always stored as [{"key","text"}] -> stable rendering on the web
# ✅ Steps / multiple / duplicate answers:
# - extended model: stored in answer_json / answer_steps_json
# - legacy model: falls back to answer="A,B" / sequence=["E","B","C"]
# ✅ rebuild_db=True: delete the DB file, then recreate it
# ==============================================
import json
import os
from typing import Any, Dict, List
from db import SessionLocal, init_db, DB_PATH
from models import Question
# -----------------------------
# Helpers
# -----------------------------
def _to_list_answer_keys(v: Any) -> List[str]:
"""
์ •๋‹ต ์ž…๋ ฅ์„ key ๋ฆฌ์ŠคํŠธ๋กœ ์ •๊ทœํ™”.
- ["A","C"] -> ["A","C"]
- "A" -> ["A"]
- "A,C" -> ["A","C"]
- "BE" -> ["B","E"] (์ „๋ถ€ ๋Œ€๋ฌธ์ž ์•ŒํŒŒ๋ฒณ์ผ ๋•Œ๋งŒ)
- dict(steps ํ…์ŠคํŠธ) -> [] (steps์—์„œ ์ฒ˜๋ฆฌ)
"""
if v is None:
return []
if isinstance(v, list):
return [str(x).strip() for x in v if str(x).strip()]
if isinstance(v, dict):
return []
s = str(v).strip()
if not s:
return []
if "," in s:
return [x.strip() for x in s.split(",") if x.strip()]
if len(s) >= 2 and s.isalpha() and s.upper() == s:
return list(s)
return [s]
def _normalize_options(opts: Any) -> List[Dict[str, str]]:
"""
options๋ฅผ ํ•ญ์ƒ ํ‘œ์ค€ ๋ฆฌ์ŠคํŠธ ํ˜•ํƒœ๋กœ:
[{"key":"A","text":"..."}, ...]
์ง€์›:
- list[str]
- dict{key:text}
- list[dict] (์ด๋ฏธ key/text)
"""
if not opts:
return []
# list[dict]
if isinstance(opts, list) and opts and all(isinstance(x, dict) for x in opts):
out = []
for o in opts:
k = str(o.get("key", "")).strip()
t = str(o.get("text", "")).strip()
if k or t:
out.append({"key": k, "text": t})
return out
# list[str]
if isinstance(opts, list):
return [{"key": chr(65 + i), "text": str(opt).strip()} for i, opt in enumerate(opts)]
# dict{key:text}
if isinstance(opts, dict):
return [{"key": str(k).strip(), "text": str(v).strip()} for k, v in opts.items()]
return []
def _infer_steps_answer_keys(item: Dict[str, Any], options_std: List[Dict[str, str]]) -> List[str]:
"""
Steps ์ •๋‹ต์„ key ๋ฆฌ์ŠคํŠธ๋กœ ๋ฝ‘๋Š”๋‹ค.
์šฐ์„ ์ˆœ์œ„:
1) answer_steps(list)
2) sequence(list)
3) answer๊ฐ€ {"1":"ํ…์ŠคํŠธ", "2":"ํ…์ŠคํŠธ"} ํ˜•ํƒœ๋ฉด options text ๋งค์นญ์œผ๋กœ key ์ถ”์ •
"""
if isinstance(item.get("answer_steps"), list):
return [str(x).strip() for x in item["answer_steps"] if str(x).strip()]
if isinstance(item.get("sequence"), list):
return [str(x).strip() for x in item["sequence"] if str(x).strip()]
ans = item.get("answer")
if isinstance(ans, dict) and all(str(k).isdigit() for k in ans.keys()):
text_to_key = {}
for o in options_std:
t = (o.get("text") or "").strip()
if t and t not in text_to_key:
text_to_key[t] = (o.get("key") or "").strip()
keys = []
for i in sorted(int(x) for x in ans.keys()):
t = str(ans.get(str(i), "")).strip()
keys.append(text_to_key.get(t, "__UNKNOWN__"))
return keys
return []
def _load_json(json_path: str) -> List[Dict[str, Any]]:
with open(json_path, "r", encoding="utf-8") as f:
data = json.load(f)
# Case Study: {"questions":[...]}
if isinstance(data, dict) and "questions" in data:
data = data["questions"]
if not isinstance(data, list):
raise ValueError("JSON root must be a list (or dict with 'questions').")
# dict ์•„๋‹Œ ๊ฒƒ ์ œ๊ฑฐ
return [x for x in data if isinstance(x, dict)]
def _normalize_item(item: Dict[str, Any]) -> Dict[str, Any]:
    """
    Flatten one raw question dict into the fields used by the ingest step.

    Alternative field names are accepted for several values (for example
    "stem" / "question" / "q_text", or "category" / "topic"); the first
    truthy one is used.

    Args:
        item: Raw question dict as loaded from the JSON file.

    Returns:
        A dict with the keys "stem", "explanation", "question_type",
        "category", "subcategory", "code", "options_std", "page",
        "page_no", "q_no_on_page", "global_no", "answer_keys",
        "answer_steps", "pairs", "sequence", "images" and "raw_answer".
        "answer_keys" is only filled when no step answer was found.
    """

    def _first_truthy(*names: str, default: Any = None) -> Any:
        # Value of the first listed field that is truthy, else `default`.
        for name in names:
            value = item.get(name)
            if value:
                return value
        return default

    # Options in the standard [{"key", "text"}] form.
    options_std = _normalize_options(item.get("options"))

    # Sort keys (when available): the first source page becomes page_no.
    pages = item.get("source_pages")
    if isinstance(pages, list) and pages:
        page_no = pages[0]
    else:
        page_no = None

    # Images: anything that is not a list is replaced by an empty list.
    images = _first_truthy("images", "image_urls", default=[])
    if not isinstance(images, list):
        images = []

    # Step answers take precedence; plain answer keys are only computed
    # when there are no steps.
    answer_steps = _infer_steps_answer_keys(item, options_std)
    if answer_steps:
        answer_keys = []
    elif isinstance(item.get("answer_keys"), list):
        stripped = (str(x).strip() for x in item["answer_keys"])
        answer_keys = [key for key in stripped if key]
    else:
        answer_keys = _to_list_answer_keys(item.get("answer"))

    return {
        "stem": str(_first_truthy("stem", "question", "q_text", default="")).strip(),
        "explanation": str(item.get("explanation") or "").strip(),
        "question_type": item.get("question_type", "MCQ"),
        "category": _first_truthy("category", "topic"),
        "subcategory": _first_truthy("subcategory", "subtopic"),
        "code": item.get("code", ""),
        "options_std": options_std,
        "page": item.get("page"),
        "page_no": page_no,
        "q_no_on_page": item.get("q_no_on_page"),
        "global_no": item.get("global_no") or item.get("question_id"),
        "answer_keys": answer_keys,
        "answer_steps": answer_steps,
        # Legacy fields are passed through unchanged.
        "pairs": item.get("pairs"),
        "sequence": item.get("sequence"),
        "images": images,
        "raw_answer": item.get("answer"),
    }
# -----------------------------
# Ingest
# -----------------------------
def ingest_questions(json_path: str, source_name: str = "imported", rebuild_db: bool = False) -> int:
    """
    Load questions from a JSON file and insert them into the database.

    The input file is read and validated before the database is touched,
    so a missing or malformed file cannot wipe an existing database when
    ``rebuild_db=True``.

    Args:
        json_path: Path of the question JSON file.
        source_name: Value stored in each question's ``source`` field.
        rebuild_db: When True and the DB file exists, delete it and let
            ``init_db()`` create it again.

    Returns:
        Number of questions added.

    Raises:
        ValueError: The JSON root is not a list (raised by ``_load_json``).
        Exception: Any error during insertion is re-raised after the
            session has been rolled back.
    """
    json_path = str(json_path)

    # Read the input first: if this raises, the existing DB stays intact.
    rows = _load_json(json_path)

    if rebuild_db and DB_PATH.exists():
        DB_PATH.unlink()
        print(f"[INFO] ๐Ÿงน Deleted DB: {DB_PATH}")

    # init_db() is called without arguments.
    init_db()

    db = SessionLocal()
    try:
        count = 0
        for raw in rows:
            qn = _normalize_item(raw)
            q = Question(
                page=qn["page"],
                stem=qn["stem"],
                explanation=qn["explanation"],
                question_type=qn["question_type"],
                category=qn["category"],
                subcategory=qn["subcategory"],
                source=source_name,
                code=qn["code"],
            )

            # Sort keys are only stored when the model exposes them
            # (extended model).
            if hasattr(q, "page_no"):
                q.page_no = qn["page_no"]
            if hasattr(q, "q_no_on_page"):
                q.q_no_on_page = qn["q_no_on_page"]
            if hasattr(q, "global_no"):
                q.global_no = qn["global_no"]

            # Options are always stored in the standard list-of-dict form.
            q.set_options(qn["options_std"])

            if qn["answer_steps"]:
                # Step-style answer.
                if hasattr(q, "set_answer_steps"):
                    q.set_answer_steps(qn["answer_steps"])
                    q.answer = ""  # legacy column is left empty in this case
                else:
                    # Legacy fallback: keep the step keys in `sequence`.
                    q.sequence = json.dumps(qn["answer_steps"], ensure_ascii=False)
                    q.answer = ""  # a plain string comparison is meaningless for steps
            elif qn["answer_keys"]:
                # Regular answer (multiple / duplicate keys included).
                if hasattr(q, "set_answer_keys"):
                    q.set_answer_keys(qn["answer_keys"])
                    q.answer = ""
                else:
                    # Legacy fallback: answer="A,B,C" (duplicates kept as is).
                    q.answer = ",".join(qn["answer_keys"])
            else:
                # No usable keys: keep the raw answer as a string.
                q.answer = str(qn["raw_answer"] or "").strip()

            # Legacy pairs / sequence are kept when present.
            # NOTE(review): when the item carries a raw `sequence`, this
            # assignment replaces the value written by the legacy steps
            # fallback above - confirm that this is intended.
            if qn["sequence"] is not None:
                q.sequence = (
                    json.dumps(qn["sequence"], ensure_ascii=False)
                    if isinstance(qn["sequence"], list)
                    else qn["sequence"]
                )
            if qn["pairs"] is not None:
                q.pairs = (
                    json.dumps(qn["pairs"], ensure_ascii=False)
                    if isinstance(qn["pairs"], (dict, list))
                    else qn["pairs"]
                )

            # Images are only stored when the model supports them.
            if hasattr(q, "set_images"):
                q.set_images(qn["images"])

            db.add(q)
            count += 1

        db.commit()
        print(f"[INFO] โœ… {count} ๋ฌธํ•ญ DB ์ ์žฌ ์™„๋ฃŒ ({source_name})")
        return count
    except Exception as e:
        db.rollback()
        print(f"[ERROR] DB ์ ์žฌ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ โ†’ {e}")
        raise
    finally:
        db.close()
if __name__ == "__main__":
    # Script entry point, configured through environment variables:
    #   QUESTIONS_JSON - input file (default: data/questions.json)
    #   SOURCE_NAME    - source label stored with each question
    #   REBUILD_DB     - "1" recreates the DB file before ingesting
    env = os.environ
    ingest_questions(
        env.get("QUESTIONS_JSON", "data/questions.json"),
        source_name=env.get("SOURCE_NAME", "az104_dump"),
        rebuild_db=env.get("REBUILD_DB", "0") == "1",
    )