Spaces:
Sleeping
Sleeping
Update pipelines/openai_ingest.py
Browse files- pipelines/openai_ingest.py +67 -38
pipelines/openai_ingest.py
CHANGED
|
@@ -24,60 +24,84 @@ def _img_to_base64(img: Image.Image) -> str:
|
|
| 24 |
return base64.b64encode(buf.getvalue()).decode("utf-8")
|
| 25 |
|
| 26 |
|
| 27 |
-
def _pdf_to_images(pdf_bytes: bytes, dpi: int =
|
| 28 |
pages = convert_from_bytes(pdf_bytes, dpi=dpi)
|
| 29 |
return pages[:max_pages]
|
| 30 |
|
| 31 |
|
| 32 |
def extract_text_with_openai(payload: bytes, filename: str, filetype: str) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
client = _client_lazy()
|
| 34 |
|
| 35 |
-
|
| 36 |
-
if filetype
|
| 37 |
-
images = _pdf_to_images(payload)
|
| 38 |
-
elif filetype == "image":
|
| 39 |
-
images = [Image.open(io.BytesIO(payload)).convert("RGB")]
|
| 40 |
-
else: # txt/docxから来たテキストbytes
|
| 41 |
text = payload.decode("utf-8", errors="ignore")
|
| 42 |
-
|
| 43 |
-
|
|
|
|
|
|
|
|
|
|
| 44 |
model=MODEL_TEXT,
|
| 45 |
-
|
| 46 |
-
{"role": "system", "content":
|
| 47 |
-
{"role": "user", "content":
|
| 48 |
],
|
|
|
|
| 49 |
)
|
| 50 |
-
return resp.
|
| 51 |
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
|
| 58 |
-
|
| 59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
|
| 62 |
def structure_with_openai(text: str) -> dict:
|
| 63 |
client = _client_lazy()
|
| 64 |
sys = (
|
| 65 |
-
"あなたは日本語レジュメの構造化アシスタントです。入力テキストからセクションを抽出し、
|
| 66 |
-
"
|
| 67 |
-
"
|
| 68 |
)
|
| 69 |
user = "以下のテキストを解析し、指定のJSONキーで返してください。\n\n" + text
|
| 70 |
-
resp = client.
|
| 71 |
model=MODEL_TEXT,
|
| 72 |
-
|
| 73 |
-
{"role": "system", "content":
|
| 74 |
-
{"role": "user", "content":
|
| 75 |
],
|
|
|
|
| 76 |
response_format={"type": "json_object"},
|
| 77 |
)
|
| 78 |
import json as _json
|
| 79 |
try:
|
| 80 |
-
data = _json.loads(resp.
|
| 81 |
except Exception:
|
| 82 |
data = {"work_experience_raw": text, "education_raw": "", "certifications_raw": "", "skills_list": []}
|
| 83 |
for k in ("work_experience_raw", "education_raw", "certifications_raw"):
|
|
@@ -88,17 +112,22 @@ def structure_with_openai(text: str) -> dict:
|
|
| 88 |
|
| 89 |
def summarize_with_openai(text: str) -> dict:
|
| 90 |
client = _client_lazy()
|
| 91 |
-
|
| 92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
model=MODEL_TEXT,
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
{"role": "user", "content": [{"type": "input_text", "text": prompt + "\n\n" + text}]},
|
| 97 |
-
],
|
| 98 |
)
|
| 99 |
-
full = resp.
|
|
|
|
|
|
|
|
|
|
| 100 |
return {
|
| 101 |
-
"300chars": full[:600]
|
| 102 |
-
"100chars": full[:120]
|
| 103 |
-
"onesent":
|
| 104 |
}
|
|
|
|
| 24 |
return base64.b64encode(buf.getvalue()).decode("utf-8")
|
| 25 |
|
| 26 |
|
| 27 |
+
def _pdf_to_images(pdf_bytes: bytes, dpi: int = 200, max_pages: int = 8) -> List[Image.Image]:
    """Rasterize the leading pages of a PDF into images.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        dpi: Rendering resolution forwarded to ``convert_from_bytes``.
        max_pages: Upper bound on the number of pages returned; a
            non-positive value yields an empty list.

    Returns:
        At most ``max_pages`` page images, in document order.
    """
    if max_pages <= 0:
        return []
    # Only render the pages that will actually be used instead of rasterizing
    # the whole document and discarding the tail.
    # NOTE(review): assumes convert_from_bytes is pdf2image's, which accepts
    # first_page/last_page -- confirm against the file's imports.
    pages = convert_from_bytes(pdf_bytes, dpi=dpi, first_page=1, last_page=max_pages)
    # Defensive slice keeps the cap even if the backend returns extra pages.
    return pages[:max_pages]
|
| 30 |
|
| 31 |
|
| 32 |
def extract_text_with_openai(payload: bytes, filename: str, filetype: str) -> str:
    """Extract resume text from an uploaded file via the OpenAI chat API.

    PDFs and images are converted to images and sent to the vision model
    (chat.completions) for OCR. Any other file type is decoded as UTF-8 text
    and only reformatted by the text model.

    Args:
        payload: Raw bytes of the uploaded file.
        filename: Original file name; not used by this function.
        filetype: ``"pdf"``, ``"image"``, or any other value for text input.

    Returns:
        The model's reply with surrounding whitespace stripped, or an empty
        string when the reply carries no text content.
    """
    client = _client_lazy()

    # Text input: no OCR needed, just ask the text model to tidy the layout.
    if filetype not in {"pdf", "image"}:
        text = payload.decode("utf-8", errors="ignore")
        system_prompt = "You clean up Japanese resumes, preserving headings and bullet structure and removing layout noise."
        user_prompt = (
            "以下の本文を、見出し・箇条書きを保ちつつ整形してください。不要な罫線/番号/改ページは除去:\n\n" + text
        )
        resp = client.chat.completions.create(
            model=MODEL_TEXT,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=0.2,
        )
        # message.content can be None; avoid AttributeError on .strip().
        return (resp.choices[0].message.content or "").strip()

    # Image / PDF input: turn the payload into a list of page images.
    if filetype == "pdf":
        images = _pdf_to_images(payload)
    else:
        images = [Image.open(io.BytesIO(payload)).convert("RGB")]

    # One user message: the OCR instruction followed by every page as a
    # base64 data URL.
    image_parts = [
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{_img_to_base64(img)}"},
        }
        for img in images
    ]
    vision_msgs = [
        {"role": "system", "content": "You are an accurate Japanese OCR assistant for resumes."},
        {"role": "user", "content": [
            {
                "type": "text",
                "text": "日本語の履歴書/職務経歴書画像です。OCRして本文を日本語テキストで忠実に返してください。",
            },
            *image_parts,
        ]},
    ]
    resp = client.chat.completions.create(
        model=MODEL_VISION,
        messages=vision_msgs,
        temperature=0.0,
    )
    # message.content can be None; avoid AttributeError on .strip().
    return (resp.choices[0].message.content or "").strip()
|
| 83 |
|
| 84 |
|
| 85 |
def structure_with_openai(text: str) -> dict:
|
| 86 |
client = _client_lazy()
|
| 87 |
sys = (
|
| 88 |
+
"あなたは日本語レジュメの構造化アシスタントです。入力テキストからセクションを抽出し、"
|
| 89 |
+
"JSONで返してください。JSONキー: work_experience_raw, education_raw, certifications_raw, skills_list。"
|
| 90 |
+
"skills_list は重複除去済み配列。各 *_raw は原文抜粋で構いません。"
|
| 91 |
)
|
| 92 |
user = "以下のテキストを解析し、指定のJSONキーで返してください。\n\n" + text
|
| 93 |
+
resp = client.chat.completions.create(
|
| 94 |
model=MODEL_TEXT,
|
| 95 |
+
messages=[
|
| 96 |
+
{"role": "system", "content": sys},
|
| 97 |
+
{"role": "user", "content": user},
|
| 98 |
],
|
| 99 |
+
temperature=0.2,
|
| 100 |
response_format={"type": "json_object"},
|
| 101 |
)
|
| 102 |
import json as _json
|
| 103 |
try:
|
| 104 |
+
data = _json.loads(resp.choices[0].message.content)
|
| 105 |
except Exception:
|
| 106 |
data = {"work_experience_raw": text, "education_raw": "", "certifications_raw": "", "skills_list": []}
|
| 107 |
for k in ("work_experience_raw", "education_raw", "certifications_raw"):
|
|
|
|
| 112 |
|
| 113 |
def summarize_with_openai(text: str) -> dict:
    """Summarize a resume in Japanese at three levels of detail.

    The text model is asked for a 300-character, a 100-character and a
    one-sentence summary in a single reply. The reply is then cut up by
    simple rules rather than parsed.

    Args:
        text: Resume body to summarize.

    Returns:
        A dict with keys ``"300chars"`` (first 600 characters of the reply),
        ``"100chars"`` (first 120 characters) and ``"onesent"`` (the reply up
        to and including the first ``。``, or the whole reply if it has none).
        All values are empty strings when the reply carries no text content.
    """
    client = _client_lazy()
    system_prompt = "You write crisp, factual Japanese executive summaries."
    user_prompt = (
        "以下の候補者レジュメ本文を、(1)300字、(2)100字、(3)1文 の3粒度で日本語要約してください。"
        "不要な記号は避け、事実を簡潔に述べてください。\n\n" + text
    )
    resp = client.chat.completions.create(
        model=MODEL_TEXT,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.2,
    )
    # message.content can be None; avoid AttributeError on .strip().
    full = (resp.choices[0].message.content or "").strip()

    # Simple rule-based extraction, so a badly formatted reply cannot break it.
    # partition() yields an empty separator when there is no "。", which
    # leaves the whole reply as the sentence.
    head, stop, _ = full.partition("。")
    one_sent = head + stop
    return {
        "300chars": full[:600],  # roughly 300 characters' worth, with margin
        "100chars": full[:120],
        "onesent": one_sent,
    }
|