| | import streamlit as st |
| | import pandas as pd |
| | import io, zipfile, re, html, json |
| | from typing import Dict, Tuple |
| |
|
# Streamlit page setup: wide layout. The title strings are mojibake-garbled
# Korean (likely "보존상자 라벨 생성기" after an encoding round-trip) and are
# reproduced as found — do not "fix" them without re-encoding verification.
st.set_page_config(page_title="๐ฆ ๋ณด์กด์์ ๋ผ๋ฒจ ์์ฑ๊ธฐ", layout="wide")
st.title("๐ฆ ๋ณด์กด์์ ๋ผ๋ฒจ ์์ฑ๊ธฐ ๐ฆ")
| |
|
| | |
| | def _year_range(series: pd.Series) -> str: |
| | s = series.astype(str).fillna("") |
| | v = s[~s.isin(["", "0", "0000"])] |
| | if v.empty: return "0000-0000" |
| | nums = pd.to_numeric(v, errors="coerce").dropna().astype(int) |
| | if nums.empty: return "0000-0000" |
| | return f"{nums.min():04d}-{nums.max():04d}" |
| |
|
def build_rows(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse the item-level sheet into one aggregated row per box number.

    Per box the result carries: first-seen metadata columns, a CRLF-joined
    bullet list of the box's items, and the min-max year range.
    NOTE(review): the Korean column-name literals below are mojibake from a
    past encoding round-trip; they are kept exactly as found because the
    uploaded spreadsheets apparently use the same garbled headers — confirm.
    """
    df = df.copy()
    # Box-number key normalised to a zero-padded 4-character string.
    df["๋ฐ์ค๋ฒํธ"] = df["๋ฐ์ค๋ฒํธ"].astype(str).str.zfill(4)
    if "์ ๋ชฉ" in df.columns:
        df["์ ๋ชฉ"] = df["์ ๋ชฉ"].astype(str)

    # Year range per box (presumably an "end year" column — TODO confirm);
    # placeholder "0000-0000" when the column is missing entirely.
    if "์ข๋ฃ์ฐ๋" in df.columns:
        yr = df.groupby("๋ฐ์ค๋ฒํธ")["์ข๋ฃ์ฐ๋"].apply(_year_range).reset_index()
        yr.columns = ["๋ฐ์ค๋ฒํธ", "์์ฐ์ฐ๋"]
    else:
        yr = pd.DataFrame({"๋ฐ์ค๋ฒํธ": df["๋ฐ์ค๋ฒํธ"].unique(), "์์ฐ์ฐ๋": "0000-0000"})

    # One "- <management no> <title>" bullet per item (management number only
    # when that column exists), joined with CRLF per box.
    has_mgmt = "๊ด๋ฆฌ๋ฒํธ" in df.columns
    lists = []
    for b, g in df.groupby("๋ฐ์ค๋ฒํธ"):
        lines = [f"- {r['๊ด๋ฆฌ๋ฒํธ']} {r.get('์ ๋ชฉ','')}" if has_mgmt else f"- {r.get('์ ๋ชฉ','')}"
                 for _, r in g.iterrows()]
        lists.append({"๋ฐ์ค๋ฒํธ": b, "๋ชฉ๋ก": "\r\n".join(lines)})
    list_df = pd.DataFrame(lists)

    # Metadata: the first row of each box stands in for the whole box, using
    # only the metadata columns actually present in the upload.
    meta_cols = ["๋ฐ์ค๋ฒํธ","์ข๋ฃ์ฐ๋","๋ณด์กด๊ธฐ๊ฐ","๋จ์์๋ฌด","๊ธฐ๋ก๋ฌผ์ฒ ","์ ๋ชฉ"]
    meta_exist = [c for c in meta_cols if c in df.columns]
    meta = df.groupby("๋ฐ์ค๋ฒํธ", as_index=False).first()[meta_exist] if meta_exist \
        else pd.DataFrame({"๋ฐ์ค๋ฒํธ": df["๋ฐ์ค๋ฒํธ"].unique()})

    # Left-merge so every box keeps a row even if a sub-frame lacks it.
    merged = meta.merge(list_df, on="๋ฐ์ค๋ฒํธ", how="left").merge(yr, on="๋ฐ์ค๋ฒํธ", how="left")
    return merged
| |
|
| | |
# Regex template for a matched fieldBegin/.../fieldEnd pair whose name="..."
# attribute equals {name}. The namespace prefix is captured once and
# back-referenced so begin and end must share the same prefix.
FIELD_PAIR_RE_TMPL = (
    r'<(?P<fprefix>[a-zA-Z0-9_]+):fieldBegin\b[^>]*\bname="{name}"[^>]*/>'
    r'(.*?)'
    r'<(?P=fprefix):fieldEnd\b[^>]*/>'
)
# .format(key=...) yields the literal token "{{key}}", used as a plain-text
# placeholder fallback inside the template document.
TOKEN_FMT = "{{{{{key}}}}}"

# Whole-paragraph matcher (<prefix:p attrs>body</prefix:p> for any prefix);
# DOTALL so paragraph bodies may span multiple lines.
PARA_RE = re.compile(
    r'<(?P<pprefix>[a-zA-Z0-9_]+):p(?P<pattrs>[^>]*)>(?P<pbody>.*?)</(?P=pprefix):p>',
    re.DOTALL
)
| |
|
| | |
| | def _extract_run_style(body: str, pprefix: str) -> str: |
| | """๋ฌธ๋จ ๋ด์ฉ์์ ์ฒซ ๋ฒ์งธ run ์์์ ์คํ์ผ์ ์ถ์ถ""" |
| | run_pattern = re.compile( |
| | rf'<{pprefix}:run[^>]*>.*?</{pprefix}:run>', |
| | re.DOTALL |
| | ) |
| | match = run_pattern.search(body) |
| | if match: |
| | return match.group(0) |
| | return f'<{pprefix}:run><{pprefix}:t><//{pprefix}:t></{pprefix}:run>' |
| |
|
| | |
| | def _make_para_with_style(pprefix: str, pattrs: str, text: str, original_run: str) -> str: |
| | esc = html.escape("" if text is None else str(text)) |
| | |
| | |
| | text_pattern = re.compile(rf'(<{pprefix}:t[^>]*>)[^<]*(</{pprefix}:t>)') |
| | new_run = text_pattern.sub(rf'\g<1>{esc}\g<2>', original_run) |
| | |
| | |
| | if new_run == original_run: |
| | t_pattern = re.compile(rf'(<{pprefix}:run[^>]*>)(.*?)(</{pprefix}:run>)', re.DOTALL) |
| | new_run = t_pattern.sub(rf'\g<1><{pprefix}:t>{esc}</{pprefix}:t>\g<3>', original_run) |
| | |
| | return f'<{pprefix}:p{pattrs}>{new_run}</{pprefix}:p>' |
| |
|
| | def _split_lines(val) -> list: |
| | if val is None: return [""] |
| | return str(val).replace("\r\n","\n").split("\n") |
| |
|
def _replace_para_multiline(xml: str, key: str, value: str, dbg: dict) -> str:
    """Replace the *whole parent paragraph* containing *key* with one new
    paragraph per line of *value*, preserving the original run styling.

    (Original docstring was garbled Korean saying roughly the same.)
    A paragraph counts as a hit when its body contains the key's
    fieldBegin/fieldEnd pair, a <t> node mentioning the key, or the literal
    "{{key}}" token. dbg["para_hits"][key] is incremented per replaced
    paragraph and dbg["touched"] is set when anything changed.
    """
    pair_pat = re.compile(FIELD_PAIR_RE_TMPL.format(name=re.escape(key)), re.DOTALL)
    tnode_pat = re.compile(rf'<(?P<p>[a-zA-Z0-9_]+):t[^>]*>[^<]*{re.escape(key)}[^<]*</(?P=p):t>', re.DOTALL)
    token_str = TOKEN_FMT.format(key=key)

    def para_repl(m):
        body = m.group("pbody")
        # Paragraphs that do not reference this key pass through unchanged.
        if not (pair_pat.search(body) or tnode_pat.search(body) or (token_str in body)):
            return m.group(0)

        lines = _split_lines(value)
        pprefix = m.group("pprefix")
        pattrs = m.group("pattrs")

        # Reuse the paragraph's first run as the style template for every line.
        original_run = _extract_run_style(body, pprefix)

        # One replacement paragraph per line of the value.
        new_paras = "".join(_make_para_with_style(pprefix, pattrs, ln, original_run) for ln in lines)
        dbg["para_hits"][key] = dbg["para_hits"].get(key, 0) + 1
        return new_paras

    xml2 = PARA_RE.sub(para_repl, xml)
    if xml2 != xml:
        dbg["touched"] = True
    return xml2
| |
|
| | def _runs_plain(text: str) -> str: |
| | return f"<hp:run><hp:t>{html.escape('' if text is None else str(text))}</hp:t></hp:run>" |
| |
|
def _apply_to_xml(xml: str, mapping: Dict[str, str], dbg: dict) -> str:
    """Apply *mapping* to one XML document, trying four strategies in order.

    1. Multi-line keys (matched by `multi_key` below) rebuild whole paragraphs.
    2. Remaining keys collapse their fieldBegin/fieldEnd pair to a plain run.
    3. Remaining keys are substituted inside <t> text nodes.
    4. Remaining keys are substituted as literal "{{key}}" tokens.

    dbg's per-strategy hit counters are updated in place and
    dbg["files_touched"] is set when anything changed. Returns the new XML.
    """
    changed_any = False

    # Keys whose values may span multiple lines get paragraph-level treatment.
    # (The Korean alternatives are mojibake literals kept as found.)
    multi_key = re.compile(r"^(๋ชฉ๋ก|list|์ ๋ชฉ)\d+$", re.IGNORECASE)
    for k, v in mapping.items():
        if multi_key.match(k):
            xml_new = _replace_para_multiline(xml, k, v, dbg)
            if xml_new != xml:
                xml = xml_new
                changed_any = True

    # Strategy 2: replace each fieldBegin/.../fieldEnd pair with a plain run.
    for k, v in mapping.items():
        if multi_key.match(k):
            continue
        replacement = _runs_plain(v)
        pat = re.compile(FIELD_PAIR_RE_TMPL.format(name=re.escape(k)), re.DOTALL)
        # Callable replacement: `replacement` is data, not a regex template —
        # a backslash in the value would otherwise raise re.error.
        xml_new, n = pat.subn(lambda m, r=replacement: r, xml)
        if n:
            dbg["field_hits"][k] = dbg["field_hits"].get(k, 0) + n
            xml = xml_new
            changed_any = True

    # Strategy 3: plain-text occurrences of the key inside <t> nodes.
    tnode_all = re.compile(
        r'(<(?P<prefix>[a-zA-Z0-9_]+):t[^>]*>)([^<]*?)</(?P=prefix):t>',
        re.DOTALL
    )
    for k, v in mapping.items():
        if multi_key.match(k):
            continue
        def repl_tnode(m, k=k, v=v):
            text_node = m.group(3)
            if k not in text_node:
                return m.group(0)
            # Escape only the inserted value: the node's existing text is
            # already XML-escaped, and re-escaping the whole node corrupted
            # entities (e.g. "&amp;" became "&amp;amp;").
            new_text = text_node.replace(k, html.escape("" if v is None else str(v)))
            return f"{m.group(1)}{new_text}</{m.group('prefix')}:t>"
        xml2 = tnode_all.sub(repl_tnode, xml)
        if xml2 != xml:
            dbg["text_hits"][k] = dbg["text_hits"].get(k, 0) + 1
            xml = xml2
            changed_any = True

    # Strategy 4: literal "{{key}}" tokens anywhere in the document.
    for k, v in mapping.items():
        if multi_key.match(k):
            continue
        tok = TOKEN_FMT.format(key=k)
        if tok in xml:
            xml = xml.replace(tok, html.escape("" if v is None else str(v)))
            dbg["token_hits"][k] = dbg["token_hits"].get(k, 0) + 1
            changed_any = True

    if changed_any:
        dbg["files_touched"] = True
    return xml
| |
|
def replace_in_hwpx(hwpx_bytes: bytes, mapping: Dict[str,str]) -> Tuple[bytes, dict]:
    """Rewrite every XML member of an HWPX (zip) archive through _apply_to_xml.

    Returns the rebuilt archive bytes plus a debug dict with per-strategy hit
    counters and the list of member files whose content changed.
    """
    import stat, time  # NOTE(review): `stat` appears unused here — kept as found.
    dbg = {"para_hits":{}, "field_hits":{}, "text_hits":{}, "token_hits":{}, "touched_files": []}
    zin = zipfile.ZipFile(io.BytesIO(hwpx_bytes), "r")
    out_buf = io.BytesIO()
    zout = zipfile.ZipFile(out_buf, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=6)

    # One timestamp for every member so the output archive is self-consistent.
    now = time.localtime()

    # The "mimetype" member is written first and STORED (uncompressed),
    # presumably matching the HWPX/ODF container convention — TODO confirm.
    names = zin.namelist()
    if "mimetype" in names:
        zi = zipfile.ZipInfo("mimetype")
        zi.compress_type = zipfile.ZIP_STORED
        # Regular-file permissions 0666 in the unix field; create_system=0
        # marks the entry as DOS/FAT-originated.
        zi.external_attr = 0o100666 << 16
        zi.create_system = 0
        zi.date_time = now[:6]
        zout.writestr(zi, zin.read("mimetype"))

    for e in zin.infolist():
        if e.filename == "mimetype":
            continue  # already written above
        data = zin.read(e.filename)
        if e.filename.lower().endswith(".xml"):
            try:
                s = data.decode("utf-8", errors="ignore")
                before = s
                # Per-file dbg wrapper sharing the cumulative hit-counter dicts.
                s = _apply_to_xml(s, mapping, {"para_hits":dbg["para_hits"], "field_hits":dbg["field_hits"],
                                               "text_hits":dbg["text_hits"], "token_hits":dbg["token_hits"],
                                               "files_touched":False})
                if s != before:
                    dbg["touched_files"].append(e.filename)
                    data = s.encode("utf-8")
            except Exception:
                # Best-effort: an XML part that fails to decode or process is
                # copied through unchanged rather than aborting the rebuild.
                pass

        # Re-create each entry's metadata from scratch so no exotic flags from
        # the source archive leak into the output.
        zi = zipfile.ZipInfo(e.filename)
        zi.compress_type = zipfile.ZIP_DEFLATED
        zi.external_attr = 0o100666 << 16
        zi.create_system = 0
        zi.date_time = now[:6]
        zi.flag_bits = 0
        zout.writestr(zi, data)

    zout.close()
    out_buf.seek(0)
    zin.close()
    return out_buf.getvalue(), dbg
| |
|
| | |
# --- Streamlit UI: usage notes and input widgets ---------------------------
# All user-facing strings below are mojibake-garbled Korean reproduced as
# found; fragments split by lost control bytes were re-joined best-effort.
with st.expander("์ฌ์ฉ๋ฒ", expanded=True):
    st.markdown("""
1. ํํ๋ฆฟ ํ์ผ์ ์๋ก๋ํด์ฃผ์ธ์.
2. ๋ณด์กด์์ ์ ๋ณด๊ฐ ๋ค์ด์๋ ์์ํ์ผ์ ์๋ก๋ํด์ฃผ์ธ์.
3. ์ถ๋ ฅํ ๋ผ๋ฒจ ๋ฒํธ๋ฅผ ์ ํํด์ฃผ์ธ์.
4. ์์ถ ํ์ผ์ ๋ค์ด๋ฐ๊ณ , ์์ถํด์ ํ ํ์ผ์ ๋ณํฉํด์ฃผ์ธ์.
5. ๋ณํฉ ํ, ๋ผ๋ฒจ์ ์ถ๋ ฅํ์ธ์.
๋จ, ํํ๋ฆฟ์ .HWPX(ํ๊ธ) ํ์ผ์ด์ด์ผ ํฉ๋๋ค. (.HWP ๋ถ๊ฐ)
    """)

# Inputs: the .hwpx template, the labels-per-page count (1-12, default 3),
# and the item data file (Excel or CSV).
tpl = st.file_uploader("๐ HWPX ํํ๋ฆฟ ์๋ก๋", type=["hwpx"])
n_per_page = st.number_input("ํํ๋ฆฟ์ ๋ผ๋ฒจ ์ธํธ ๊ฐ์(ํ ํ์ด์ง N๊ฐ)", 1, 12, 3, 1)
data = st.file_uploader("๐ ๋ฐ์ดํฐ ์๋ก๋ (Excel/CSV)", type=["xlsx","xls","csv"])
| |
|
# --- Main flow: runs once both the template and the data file are uploaded --
if tpl and data:
    tpl_bytes = tpl.read()
    # CSV by extension, otherwise Excel.
    df = pd.read_csv(data) if data.name.lower().endswith(".csv") else pd.read_excel(data)

    # The box-number column is mandatory; abort this Streamlit run otherwise.
    if "๋ฐ์ค๋ฒํธ" not in df.columns:
        st.error("โ ํ์ ์ปฌ๋ผ '๋ฐ์ค๋ฒํธ'๊ฐ ์์ต๋๋ค."); st.stop()

    st.success("โ์์น ๋งคํ ์๋ฃ (์์์ธก)")
    st.dataframe(df.head(10), use_container_width=True)

    # One aggregated row per box; box numbers normalised to 4 digits.
    merged = build_rows(df)
    boxes = merged["๋ฐ์ค๋ฒํธ"].astype(str).str.zfill(4).unique().tolist()

    st.subheader("๐ ์๋ก๋๋ ๋ฐ์ค๋ฒํธ ๋ชฉ๋ก")
    st.write(f"์ด **{len(boxes)}**๊ฐ")
    st.dataframe(pd.DataFrame({"๋ฐ์ค๋ฒํธ": boxes}), use_container_width=True, height=240)

    # Empty selection means "all boxes".
    sel = st.multiselect("์์ฑํ ๋ฐ์ค๋ฒํธ ์ ํ (๋น์ฐ๋ฉด ์ ์ฒด)", options=boxes)
    work = merged[merged["๋ฐ์ค๋ฒํธ"].isin(sel)] if sel else merged
    records = work.sort_values("๋ฐ์ค๋ฒํธ").to_dict(orient="records")

    # --- Preview of the first page's placeholder mapping --------------------
    st.subheader("๐งช 1ํ์ด์ง ๋งคํ ํ๋ฆฌ๋ทฐ")
    # Placeholder base names; each is suffixed with the slot index (1..N).
    keys = ["๋ฐ์ค๋ฒํธ","์ข๋ฃ์ฐ๋","๋ณด์กด๊ธฐ๊ฐ","๋จ์์๋ฌด","๊ธฐ๋ก๋ฌผ์ฒ ","๋ชฉ๋ก","์ ๋ชฉ","์๋ฌด๋ช"]
    mapping_preview = {}
    for i in range(int(n_per_page)):
        if i < len(records):
            r = records[i]
            mapping_preview.update({
                f"๋ฐ์ค๋ฒํธ{i+1}": r.get("๋ฐ์ค๋ฒํธ",""),
                f"์ข๋ฃ์ฐ๋{i+1}": r.get("์์ฐ์ฐ๋",""),
                f"๋ณด์กด๊ธฐ๊ฐ{i+1}": r.get("๋ณด์กด๊ธฐ๊ฐ",""),
                f"๋จ์์๋ฌด{i+1}": r.get("๋จ์์๋ฌด",""),
                f"๊ธฐ๋ก๋ฌผ์ฒ {i+1}": r.get("๊ธฐ๋ก๋ฌผ์ฒ ",""),
                f"๋ชฉ๋ก{i+1}": r.get("๋ชฉ๋ก",""),
                f"์ ๋ชฉ{i+1}": r.get("์ ๋ชฉ",""),
                f"์๋ฌด๋ช{i+1}": r.get("์ ๋ชฉ",""),
            })
        else:
            # Pad unused label slots on the page with empty strings.
            for k in keys: mapping_preview[f"{k}{i+1}"] = ""
    st.dataframe(pd.DataFrame([{"ํค":k, "๊ฐ ์๋ถ๋ถ":str(v)[:120]} for k,v in sorted(mapping_preview.items())]),
                 use_container_width=True, height=320)

    # --- Generate one HWPX per page and bundle them into a ZIP --------------
    if st.button("๐ ๋ผ๋ฒจ ์์ฑ (ํ์ด์ง๋ณ HWPX ZIP)"):
        mem = io.BytesIO(); zout = zipfile.ZipFile(mem, "w", zipfile.ZIP_DEFLATED)
        pages = (len(records) + int(n_per_page) - 1) // int(n_per_page)  # ceil division
        debug_all = []

        for p in range(pages):
            chunk = records[p*int(n_per_page):(p+1)*int(n_per_page)]
            mapping = {}
            for i in range(int(n_per_page)):
                if i < len(chunk):
                    r = chunk[i]
                    mapping[f"๋ฐ์ค๋ฒํธ{i+1}"] = r.get("๋ฐ์ค๋ฒํธ","")
                    mapping[f"์ข๋ฃ์ฐ๋{i+1}"] = r.get("์์ฐ์ฐ๋","")
                    mapping[f"๋ณด์กด๊ธฐ๊ฐ{i+1}"] = r.get("๋ณด์กด๊ธฐ๊ฐ","")
                    mapping[f"๋จ์์๋ฌด{i+1}"] = r.get("๋จ์์๋ฌด","")
                    mapping[f"๊ธฐ๋ก๋ฌผ์ฒ {i+1}"] = r.get("๊ธฐ๋ก๋ฌผ์ฒ ","")
                    mapping[f"๋ชฉ๋ก{i+1}"] = r.get("๋ชฉ๋ก","")
                    # The same title value feeds both title-like placeholders.
                    title_val = r.get("์ ๋ชฉ","")
                    mapping[f"์ ๋ชฉ{i+1}"] = title_val
                    mapping[f"์๋ฌด๋ช{i+1}"] = title_val
                else:
                    # Blank out unused slots so stale template text is cleared.
                    for k in keys: mapping[f"{k}{i+1}"] = ""

            out_hwpx, dbg = replace_in_hwpx(tpl_bytes, mapping)
            debug_all.append({"page": p+1, "stats": dbg})
            # Page file name lists the box numbers it contains.
            name = "_".join([r.get("๋ฐ์ค๋ฒํธ","") for r in chunk]) if chunk else f"empty_{p+1}"
            zout.writestr(f"label_{name}.hwpx", out_hwpx)

        zout.close(); mem.seek(0)
        st.download_button("โฌ๏ธ ZIP ๋ค์ด๋ก๋", data=mem, file_name="labels_by_page.zip", mime="application/zip")
        st.download_button("โฌ๏ธ ๋๋ฒ๊ทธ(JSON)", data=json.dumps(debug_all, ensure_ascii=False, indent=2),
                           file_name="debug.json", mime="application/json")