Spaces:
Running
Running
# ==============================================
# ingest.py (v2025-UNIVERSAL-COMPAT)
# - 100% compatible with the current db.py / init_db() (which takes no arguments)
# - Works safely against the existing questions table schema
#   (options_json / answer / pairs / sequence)
# - Automatically uses the extended model (set_answer_keys / set_answer_steps /
#   set_images / page_no, etc.) when it is present
# - options are always stored as [{"key","text"}] -> stable rendering on the web
# - Steps / multiple / duplicate answers:
#     - extended model: stored in answer_json / answer_steps_json
#     - legacy model:   falls back to answer="A,B" / sequence=["E","B","C"]
# - rebuild_db=True: delete the DB file, then recreate it
# ==============================================
| import json | |
| import os | |
| from typing import Any, Dict, List | |
| from db import SessionLocal, init_db, DB_PATH | |
| from models import Question | |
| # ----------------------------- | |
| # Helpers | |
| # ----------------------------- | |
| def _to_list_answer_keys(v: Any) -> List[str]: | |
| """ | |
| ์ ๋ต ์ ๋ ฅ์ key ๋ฆฌ์คํธ๋ก ์ ๊ทํ. | |
| - ["A","C"] -> ["A","C"] | |
| - "A" -> ["A"] | |
| - "A,C" -> ["A","C"] | |
| - "BE" -> ["B","E"] (์ ๋ถ ๋๋ฌธ์ ์ํ๋ฒณ์ผ ๋๋ง) | |
| - dict(steps ํ ์คํธ) -> [] (steps์์ ์ฒ๋ฆฌ) | |
| """ | |
| if v is None: | |
| return [] | |
| if isinstance(v, list): | |
| return [str(x).strip() for x in v if str(x).strip()] | |
| if isinstance(v, dict): | |
| return [] | |
| s = str(v).strip() | |
| if not s: | |
| return [] | |
| if "," in s: | |
| return [x.strip() for x in s.split(",") if x.strip()] | |
| if len(s) >= 2 and s.isalpha() and s.upper() == s: | |
| return list(s) | |
| return [s] | |
| def _normalize_options(opts: Any) -> List[Dict[str, str]]: | |
| """ | |
| options๋ฅผ ํญ์ ํ์ค ๋ฆฌ์คํธ ํํ๋ก: | |
| [{"key":"A","text":"..."}, ...] | |
| ์ง์: | |
| - list[str] | |
| - dict{key:text} | |
| - list[dict] (์ด๋ฏธ key/text) | |
| """ | |
| if not opts: | |
| return [] | |
| # list[dict] | |
| if isinstance(opts, list) and opts and all(isinstance(x, dict) for x in opts): | |
| out = [] | |
| for o in opts: | |
| k = str(o.get("key", "")).strip() | |
| t = str(o.get("text", "")).strip() | |
| if k or t: | |
| out.append({"key": k, "text": t}) | |
| return out | |
| # list[str] | |
| if isinstance(opts, list): | |
| return [{"key": chr(65 + i), "text": str(opt).strip()} for i, opt in enumerate(opts)] | |
| # dict{key:text} | |
| if isinstance(opts, dict): | |
| return [{"key": str(k).strip(), "text": str(v).strip()} for k, v in opts.items()] | |
| return [] | |
| def _infer_steps_answer_keys(item: Dict[str, Any], options_std: List[Dict[str, str]]) -> List[str]: | |
| """ | |
| Steps ์ ๋ต์ key ๋ฆฌ์คํธ๋ก ๋ฝ๋๋ค. | |
| ์ฐ์ ์์: | |
| 1) answer_steps(list) | |
| 2) sequence(list) | |
| 3) answer๊ฐ {"1":"ํ ์คํธ", "2":"ํ ์คํธ"} ํํ๋ฉด options text ๋งค์นญ์ผ๋ก key ์ถ์ | |
| """ | |
| if isinstance(item.get("answer_steps"), list): | |
| return [str(x).strip() for x in item["answer_steps"] if str(x).strip()] | |
| if isinstance(item.get("sequence"), list): | |
| return [str(x).strip() for x in item["sequence"] if str(x).strip()] | |
| ans = item.get("answer") | |
| if isinstance(ans, dict) and all(str(k).isdigit() for k in ans.keys()): | |
| text_to_key = {} | |
| for o in options_std: | |
| t = (o.get("text") or "").strip() | |
| if t and t not in text_to_key: | |
| text_to_key[t] = (o.get("key") or "").strip() | |
| keys = [] | |
| for i in sorted(int(x) for x in ans.keys()): | |
| t = str(ans.get(str(i), "")).strip() | |
| keys.append(text_to_key.get(t, "__UNKNOWN__")) | |
| return keys | |
| return [] | |
| def _load_json(json_path: str) -> List[Dict[str, Any]]: | |
| with open(json_path, "r", encoding="utf-8") as f: | |
| data = json.load(f) | |
| # Case Study: {"questions":[...]} | |
| if isinstance(data, dict) and "questions" in data: | |
| data = data["questions"] | |
| if not isinstance(data, list): | |
| raise ValueError("JSON root must be a list (or dict with 'questions').") | |
| # dict ์๋ ๊ฒ ์ ๊ฑฐ | |
| return [x for x in data if isinstance(x, dict)] | |
def _normalize_item(item: Dict[str, Any]) -> Dict[str, Any]:
    """Flatten one raw question item into the fields the ingest step needs.

    Args:
        item: Raw question dict as loaded from the JSON file.

    Returns:
        A dict with these keys:
            stem, explanation, question_type, category, subcategory, code,
            options_std (normalized options), page (legacy page field),
            page_no (first entry of ``source_pages``, if any), q_no_on_page,
            global_no, answer_keys, answer_steps, pairs, sequence, images,
            raw_answer.
        ``answer_keys`` is only filled when no step-style answer was found.
    """
    # Question text: first non-empty of the accepted field names.
    stem = str(item.get("stem") or item.get("question") or item.get("q_text") or "").strip()
    explanation = str(item.get("explanation") or "").strip()

    # `.get(key, default)` only applies the default when the key is absent;
    # an explicit JSON null must fall back to the same default.
    qtype = item.get("question_type") or "MCQ"
    code = item.get("code")
    if code is None:
        code = ""

    category = item.get("category") or item.get("topic") or None
    subcategory = item.get("subcategory") or item.get("subtopic") or None

    options_std = _normalize_options(item.get("options"))

    # Ordering keys (optional).
    source_pages = item.get("source_pages")
    page_no = source_pages[0] if isinstance(source_pages, list) and source_pages else None
    global_no = item.get("global_no") or item.get("question_id")

    # Images: accept a list, or a single URL given as a bare string.
    images = item.get("images") or item.get("image_urls") or []
    if isinstance(images, str):
        images = [images]
    elif not isinstance(images, list):
        images = []

    # Step-style answers take precedence; plain answer keys are only derived
    # when no steps were found.
    answer_steps = _infer_steps_answer_keys(item, options_std)
    answer_keys: List[str] = []
    if not answer_steps:
        explicit_keys = item.get("answer_keys")
        if isinstance(explicit_keys, list):
            answer_keys = [str(x).strip() for x in explicit_keys if str(x).strip()]
        else:
            answer_keys = _to_list_answer_keys(item.get("answer"))

    return {
        "stem": stem,
        "explanation": explanation,
        "question_type": qtype,
        "category": category,
        "subcategory": subcategory,
        "code": code,
        "options_std": options_std,
        "page": item.get("page"),
        "page_no": page_no,
        "q_no_on_page": item.get("q_no_on_page"),
        "global_no": global_no,
        "answer_keys": answer_keys,
        "answer_steps": answer_steps,
        # Legacy fields are passed through untouched.
        "pairs": item.get("pairs"),
        "sequence": item.get("sequence"),
        "images": images,
        "raw_answer": item.get("answer"),
    }
| # ----------------------------- | |
| # Ingest | |
| # ----------------------------- | |
def ingest_questions(json_path: str, source_name: str = "imported", rebuild_db: bool = False) -> int:
    """Load questions from a JSON file and insert them into the database.

    Works with both the legacy ``Question`` model and an extended one: the
    optional attributes/setters (``page_no``, ``q_no_on_page``, ``global_no``,
    ``set_answer_steps``, ``set_answer_keys``, ``set_images``) are used only
    when the model instance has them.

    Args:
        json_path: Path of the JSON file to ingest.
        source_name: Value stored in each question's ``source`` field.
        rebuild_db: When true, delete the existing DB file before calling
            ``init_db()`` so the database is recreated.

    Returns:
        The number of questions inserted.

    Raises:
        ValueError, OSError: If the JSON file cannot be read or has the wrong
            shape (raised before the database is touched).
        Exception: Any error raised while inserting is re-raised after the
            session has been rolled back.
    """
    json_path = str(json_path)

    # Parse the input first: a missing or malformed file must not cost us the
    # existing database when rebuild_db is set.
    rows = _load_json(json_path)

    if rebuild_db and DB_PATH.exists():
        DB_PATH.unlink()
        print(f"[INFO] 🧹 Deleted DB: {DB_PATH}")

    # The current db.py only offers an argument-less init_db().
    init_db()

    db = SessionLocal()
    try:
        count = 0
        for raw in rows:
            qn = _normalize_item(raw)
            q = Question(
                page=qn["page"],
                stem=qn["stem"],
                explanation=qn["explanation"],
                question_type=qn["question_type"],
                category=qn["category"],
                subcategory=qn["subcategory"],
                source=source_name,
                code=qn["code"],
            )

            # Ordering keys, only when the model has those attributes.
            for attr in ("page_no", "q_no_on_page", "global_no"):
                if hasattr(q, attr):
                    setattr(q, attr, qn[attr])

            # Options are always stored in the standard list[dict] form.
            q.set_options(qn["options_std"])

            # True once q.sequence holds the step answer (legacy fallback).
            steps_stored_in_sequence = False

            if qn["answer_steps"]:
                if hasattr(q, "set_answer_steps"):
                    q.set_answer_steps(qn["answer_steps"])
                else:
                    # Legacy fallback: keep the step keys in `sequence`.
                    q.sequence = json.dumps(qn["answer_steps"], ensure_ascii=False)
                    steps_stored_in_sequence = True
                # String comparison on `answer` is meaningless for steps.
                q.answer = ""
            elif qn["answer_keys"]:
                if hasattr(q, "set_answer_keys"):
                    q.set_answer_keys(qn["answer_keys"])
                    q.answer = ""
                else:
                    # Legacy fallback: answer="A,B,C" (duplicates kept as-is).
                    q.answer = ",".join(qn["answer_keys"])
            else:
                # Ambiguous answer: keep the original value as text.
                # NOTE(review): a dict answer ends up as its Python repr here,
                # not JSON — confirm what the consumer expects.
                q.answer = str(qn["raw_answer"] or "").strip()

            # Pass legacy pairs/sequence through when present. The raw
            # sequence must not overwrite step keys that were just stored in
            # `sequence` by the legacy fallback above.
            if qn["sequence"] is not None and not steps_stored_in_sequence:
                if isinstance(qn["sequence"], list):
                    q.sequence = json.dumps(qn["sequence"], ensure_ascii=False)
                else:
                    q.sequence = qn["sequence"]
            if qn["pairs"] is not None:
                if isinstance(qn["pairs"], (dict, list)):
                    q.pairs = json.dumps(qn["pairs"], ensure_ascii=False)
                else:
                    q.pairs = qn["pairs"]

            # Images, only when the model supports them.
            if hasattr(q, "set_images"):
                q.set_images(qn["images"])

            db.add(q)
            count += 1

        db.commit()
        print(f"[INFO] ✅ {count} 문항 DB 적재 완료 ({source_name})")
        return count
    except Exception as e:
        db.rollback()
        print(f"[ERROR] DB 적재 중 오류 발생 → {e}")
        raise
    finally:
        db.close()
if __name__ == "__main__":
    # Script entry point, configured through environment variables:
    #   QUESTIONS_JSON - input file (default: data/questions.json)
    #   SOURCE_NAME    - label stored on every question (default: az104_dump)
    #   REBUILD_DB     - exactly "1" deletes and recreates the DB first
    ingest_questions(
        os.getenv("QUESTIONS_JSON", "data/questions.json"),
        source_name=os.getenv("SOURCE_NAME", "az104_dump"),
        rebuild_db=os.getenv("REBUILD_DB", "0") == "1",
    )