Spaces:

Corin1998
/

HFResumeIntakeSystem_DC

Sleeping

App Files Files Community

Corin1998 committed 21 days ago

Commit

fb38314

verified ·

1 Parent(s): 8344d24

Update pipelines/openai_ingest.py

Browse files

Files changed (1) hide show

pipelines/openai_ingest.py +67 -38

pipelines/openai_ingest.py CHANGED Viewed

@@ -24,60 +24,84 @@ def _img_to_base64(img: Image.Image) -> str:
     return base64.b64encode(buf.getvalue()).decode("utf-8")
-def _pdf_to_images(pdf_bytes: bytes, dpi: int = 220, max_pages: int = 10) -> List[Image.Image]:
     pages = convert_from_bytes(pdf_bytes, dpi=dpi)
     return pages[:max_pages]
 def extract_text_with_openai(payload: bytes, filename: str, filetype: str) -> str:
     client = _client_lazy()
-    images: List[Image.Image] = []
-    if filetype == "pdf":
-        images = _pdf_to_images(payload)
-    elif filetype == "image":
-        images = [Image.open(io.BytesIO(payload)).convert("RGB")]
-    else:  # txt/docxから来たテキストbytes
         text = payload.decode("utf-8", errors="ignore")
-        prompt = "以下は履歴書/職務経歴書の本文です。レイアウトノイズを除去し、見出しや箇条書きを維持しつつ読みやすいテキストに整形して返してください。"
-        resp = client.responses.create(
             model=MODEL_TEXT,
-            input=[
-                {"role": "system", "content": "You are a meticulous document cleaner for Japanese resumes."},
-                {"role": "user", "content": [{"type": "input_text", "text": prompt + "\n\n" + text}]},
             ],
         )
-        return resp.output_text
-    content = [
-        {"type": "input_text", "text": "日本語の履歴書/職務経歴書の画像です。OCRして本文を日本語テキストで忠実に返してください。"}
-    ]
-    for img in images:
-        content.append({"type": "input_image", "image_data": _img_to_base64(img)})
-    resp = client.responses.create(model=MODEL_VISION, input=[{"role": "user", "content": content}])
-    return resp.output_text
 def structure_with_openai(text: str) -> dict:
     client = _client_lazy()
     sys = (
-        "あなたは日本語レジュメの構造化アシスタントです。入力テキストからセクションを抽出し、JSONで返してください。"
-        " JSONキー: work_experience_raw, education_raw, certifications_raw, skills_list。"
-        " skills_list は重複除去済み配列。work_experience_raw等は原文抜粋で良い。"
     )
     user = "以下のテキストを解析し、指定のJSONキーで返してください。\n\n" + text
-    resp = client.responses.create(
         model=MODEL_TEXT,
-        input=[
-            {"role": "system", "content": [{"type": "input_text", "text": sys}]},
-            {"role": "user", "content": [{"type": "input_text", "text": user}]},
         ],
         response_format={"type": "json_object"},
     )
     import json as _json
     try:
-        data = _json.loads(resp.output_text)
     except Exception:
         data = {"work_experience_raw": text, "education_raw": "", "certifications_raw": "", "skills_list": []}
     for k in ("work_experience_raw", "education_raw", "certifications_raw"):
@@ -88,17 +112,22 @@ def structure_with_openai(text: str) -> dict:
 def summarize_with_openai(text: str) -> dict:
     client = _client_lazy()
-    prompt = "以下の候補者レジュメ本文を、(1)300字、(2)100字、(3)1文 の3粒度で日本語要約してください。"
-    resp = client.responses.create(
         model=MODEL_TEXT,
-        input=[
-            {"role": "system", "content": [{"type": "input_text", "text": "You write crisp Japanese executive summaries."}]},
-            {"role": "user", "content": [{"type": "input_text", "text": prompt + "\n\n" + text}]},
-        ],
     )
-    full = resp.output_text
     return {
-        "300chars": full[:600] if full else "",
-        "100chars": full[:120] if full else "",
-        "onesent": (full.split("。")[0] + "。") if ("。" in full) else full,
     }

     return base64.b64encode(buf.getvalue()).decode("utf-8")
def _pdf_to_images(pdf_bytes: bytes, dpi: int = 200, max_pages: int = 8) -> List[Image.Image]:
    """Rasterize the leading pages of a PDF into images.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        dpi: Rendering resolution handed to ``convert_from_bytes``.
        max_pages: Upper bound on the number of pages returned.

    Returns:
        At most ``max_pages`` page images, in document order.
    """
    # Ask the converter to stop at ``max_pages`` so that long PDFs are not
    # fully rasterized only to be thrown away by the slice below.
    # NOTE(review): assumes ``convert_from_bytes`` is pdf2image's, which
    # accepts ``last_page`` -- confirm against the file's imports.
    # Non-positive limits keep the original render-then-slice behavior.
    page_limit = {"last_page": max_pages} if max_pages > 0 else {}
    pages = convert_from_bytes(pdf_bytes, dpi=dpi, **page_limit)
    return pages[:max_pages]
 def extract_text_with_openai(payload: bytes, filename: str, filetype: str) -> str:
+    """
+    画像/PDF: 画像化して Vision (chat.completions) へ。
+    txt/docx: テキスト整形だけ実施（安定・低コスト）。
+    """
     client = _client_lazy()
+    # テキストの場合はそのまま整形
+    if filetype not in {"pdf", "image"}:
         text = payload.decode("utf-8", errors="ignore")
+        sys = "You clean up Japanese resumes, preserving headings and bullet structure and removing layout noise."
+        user = (
+            "以下の本文を、見出し・箇条書きを保ちつつ整形してください。不要な罫線/番号/改ページは除去：\n\n" + text
+        )
+        resp = client.chat.completions.create(
             model=MODEL_TEXT,
+            messages=[
+                {"role": "system", "content": sys},
+                {"role": "user", "content": user},
             ],
+            temperature=0.2,
         )
+        return resp.choices[0].message.content.strip()
+    # 画像/PDF → 画像列へ
+    if filetype == "pdf":
+        images = _pdf_to_images(payload)
+    else:
+        images = [Image.open(io.BytesIO(payload)).convert("RGB")]
+    vision_msgs = [
+        {"role": "system", "content": "You are an accurate Japanese OCR assistant for resumes."},
+        {"role": "user", "content": [
+            {
+                "type": "text",
+                "text": "日本語の履歴書/職務経歴書画像です。OCRして本文を日本語テキストで忠実に返してください。"
+            },
+            *[
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/png;base64,{_img_to_base64(img)}"}
+                } for img in images
+            ]
+        ]},
+    ]
+    resp = client.chat.completions.create(
+        model=MODEL_VISION,
+        messages=vision_msgs,
+        temperature=0.0,
+    )
+    return resp.choices[0].message.content.strip()
 def structure_with_openai(text: str) -> dict:
     client = _client_lazy()
     sys = (
+        "あなたは日本語レジュメの構造化アシスタントです。入力テキストからセクションを抽出し、"
+        "JSONで返してください。JSONキー: work_experience_raw, education_raw, certifications_raw, skills_list。"
+        "skills_list は重複除去済み配列。各 *_raw は原文抜粋で構いません。"
     )
     user = "以下のテキストを解析し、指定のJSONキーで返してください。\n\n" + text
+    resp = client.chat.completions.create(
         model=MODEL_TEXT,
+        messages=[
+            {"role": "system", "content": sys},
+            {"role": "user", "content": user},
         ],
+        temperature=0.2,
         response_format={"type": "json_object"},
     )
     import json as _json
     try:
+        data = _json.loads(resp.choices[0].message.content)
     except Exception:
         data = {"work_experience_raw": text, "education_raw": "", "certifications_raw": "", "skills_list": []}
     for k in ("work_experience_raw", "education_raw", "certifications_raw"):
 def summarize_with_openai(text: str) -> dict:
     client = _client_lazy()
+    sys = "You write crisp, factual Japanese executive summaries."
+    user = (
+        "以下の候補者レジュメ本文を、(1)300字、(2)100字、(3)1文 の3粒度で日本語要約してください。"
+        "不要な記号は避け、事実を簡潔に述べてください。\n\n" + text
+    )
+    resp = client.chat.completions.create(
         model=MODEL_TEXT,
+        messages=[{"role": "system", "content": sys}, {"role": "user", "content": user}],
+        temperature=0.2,
     )
+    full = resp.choices[0].message.content.strip()
+    # ルールベース簡易抽出（フォーマット崩れでも破綻しない）
+    one_sent = full.split("。")[0] + "。" if "。" in full else full
     return {
+        "300chars": full[:600],   # だいたい300字相当（マージン確保）
+        "100chars": full[:120],
+        "onesent": one_sent,
     }