Corin1998 committed on
Commit
fb38314
·
verified ·
1 Parent(s): 8344d24

Update pipelines/openai_ingest.py

Browse files
Files changed (1) hide show
  1. pipelines/openai_ingest.py +67 -38
pipelines/openai_ingest.py CHANGED
@@ -24,60 +24,84 @@ def _img_to_base64(img: Image.Image) -> str:
24
  return base64.b64encode(buf.getvalue()).decode("utf-8")
25
 
26
 
def _pdf_to_images(pdf_bytes: bytes, dpi: int = 220, max_pages: int = 10) -> List[Image.Image]:
    """Rasterize the leading pages of a PDF into images.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        dpi: Rendering resolution forwarded to ``convert_from_bytes``.
        max_pages: Upper bound on the number of pages returned, counted
            from the first page.

    Returns:
        At most ``max_pages`` page images in document order; an empty list
        when ``max_pages`` is not positive.
    """
    if max_pages <= 0:
        return []
    # Ask the converter for only the pages that will be kept, so a long PDF
    # is not fully rendered just to be truncated afterwards.
    # NOTE(review): assumes convert_from_bytes is pdf2image's, which accepts
    # first_page/last_page — confirm against the file's imports.
    pages = convert_from_bytes(pdf_bytes, dpi=dpi, first_page=1, last_page=max_pages)
    return pages[:max_pages]
30
 
31
 
def extract_text_with_openai(payload: bytes, filename: str, filetype: str) -> str:
    """Extract resume text from an uploaded file via the OpenAI Responses API.

    Args:
        payload: Raw file bytes. For ``filetype`` other than ``"pdf"`` or
            ``"image"`` these are decoded as UTF-8 text.
        filename: Original file name; not used by this function.
        filetype: ``"pdf"`` and ``"image"`` are OCR'd with ``MODEL_VISION``;
            anything else is treated as text and cleaned with ``MODEL_TEXT``.

    Returns:
        The ``output_text`` of the model response.
    """
    client = _client_lazy()

    if filetype not in ("pdf", "image"):
        # Text bytes that originated from txt/docx input.
        text = payload.decode("utf-8", errors="ignore")
        prompt = "以下は履歴書/職務経歴書の本文です。レイアウトノイズを除去し、見出しや箇条書きを維持しつつ読みやすいテキストに整形して返してください。"
        resp = client.responses.create(
            model=MODEL_TEXT,
            input=[
                {"role": "system", "content": "You are a meticulous document cleaner for Japanese resumes."},
                {"role": "user", "content": [{"type": "input_text", "text": prompt + "\n\n" + text}]},
            ],
        )
        return resp.output_text

    if filetype == "pdf":
        images = _pdf_to_images(payload)
    else:
        images = [Image.open(io.BytesIO(payload)).convert("RGB")]

    content = [
        {"type": "input_text", "text": "日本語の履歴書/職務経歴書の画像です。OCRして本文を日本語テキストで忠実に返してください。"}
    ]
    # The Responses API takes inline images as an ``image_url`` data URL;
    # ``image_data`` is not a recognised field of an ``input_image`` part.
    # NOTE(review): the PNG media type assumes _img_to_base64 encodes PNG — confirm.
    content.extend(
        {"type": "input_image", "image_url": f"data:image/png;base64,{_img_to_base64(img)}"}
        for img in images
    )

    resp = client.responses.create(model=MODEL_VISION, input=[{"role": "user", "content": content}])
    return resp.output_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
 
62
  def structure_with_openai(text: str) -> dict:
63
  client = _client_lazy()
64
  sys = (
65
- "あなたは日本語レジュメの構造化アシスタントです。入力テキストからセクションを抽出し、JSONで返してください。"
66
- " JSONキー: work_experience_raw, education_raw, certifications_raw, skills_list。"
67
- " skills_list は重複除去済み配列。work_experience_raw等は原文抜粋で良い。"
68
  )
69
  user = "以下のテキストを解析し、指定のJSONキーで返してください。\n\n" + text
70
- resp = client.responses.create(
71
  model=MODEL_TEXT,
72
- input=[
73
- {"role": "system", "content": [{"type": "input_text", "text": sys}]},
74
- {"role": "user", "content": [{"type": "input_text", "text": user}]},
75
  ],
 
76
  response_format={"type": "json_object"},
77
  )
78
  import json as _json
79
  try:
80
- data = _json.loads(resp.output_text)
81
  except Exception:
82
  data = {"work_experience_raw": text, "education_raw": "", "certifications_raw": "", "skills_list": []}
83
  for k in ("work_experience_raw", "education_raw", "certifications_raw"):
@@ -88,17 +112,22 @@ def structure_with_openai(text: str) -> dict:
88
 
def summarize_with_openai(text: str) -> dict:
    """Summarize resume text with ``MODEL_TEXT`` and slice it into three sizes.

    Args:
        text: Resume body to summarize.

    Returns:
        A dict with keys ``"300chars"`` (first 600 characters of the model
        output), ``"100chars"`` (first 120 characters) and ``"onesent"``
        (the output up to and including the first ``。``, or the whole
        output when it contains none). All values are empty strings when
        the model returns no text.
    """
    client = _client_lazy()
    prompt = "以下の候補者レジュメ本文を、(1)300字、(2)100字、(3)1文 の3粒度で日本語要約してください。"
    resp = client.responses.create(
        model=MODEL_TEXT,
        input=[
            {"role": "system", "content": [{"type": "input_text", "text": "You write crisp Japanese executive summaries."}]},
            {"role": "user", "content": [{"type": "input_text", "text": prompt + "\n\n" + text}]},
        ],
    )
    # Normalise once so all three derived values treat missing output the
    # same way; previously only the first two tolerated a falsy result.
    full = resp.output_text or ""
    head, stop, _ = full.partition("。")
    return {
        "300chars": full[:600],
        "100chars": full[:120],
        "onesent": head + stop if stop else full,
    }
 
24
  return base64.b64encode(buf.getvalue()).decode("utf-8")
25
 
26
 
def _pdf_to_images(pdf_bytes: bytes, dpi: int = 200, max_pages: int = 8) -> List[Image.Image]:
    """Rasterize the leading pages of a PDF into images.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        dpi: Rendering resolution forwarded to ``convert_from_bytes``.
        max_pages: Upper bound on the number of pages returned, counted
            from the first page.

    Returns:
        At most ``max_pages`` page images in document order; an empty list
        when ``max_pages`` is not positive.
    """
    if max_pages <= 0:
        return []
    # Ask the converter for only the pages that will be kept, so a long PDF
    # is not fully rendered just to be truncated afterwards.
    # NOTE(review): assumes convert_from_bytes is pdf2image's, which accepts
    # first_page/last_page — confirm against the file's imports.
    pages = convert_from_bytes(pdf_bytes, dpi=dpi, first_page=1, last_page=max_pages)
    return pages[:max_pages]
30
 
31
 
def extract_text_with_openai(payload: bytes, filename: str, filetype: str) -> str:
    """Extract resume text from an uploaded file via OpenAI chat completions.

    Images and PDFs are converted to images and sent to ``MODEL_VISION`` for
    OCR; any other file type is decoded as UTF-8 text and only cleaned up
    with ``MODEL_TEXT``.

    Args:
        payload: Raw file bytes.
        filename: Original file name; not used by this function.
        filetype: ``"pdf"``, ``"image"``, or anything else for text input.

    Returns:
        The stripped model output, or ``""`` when there is nothing to send
        (blank text, a PDF that yields no pages) or the model returns no
        content.
    """
    client = _client_lazy()

    # Text input: clean-up only.
    if filetype not in {"pdf", "image"}:
        text = payload.decode("utf-8", errors="ignore")
        if not text.strip():
            # Nothing to clean; skip the API call rather than let the model
            # invent content for an empty document.
            return ""
        system_prompt = "You clean up Japanese resumes, preserving headings and bullet structure and removing layout noise."
        user_prompt = (
            "以下の本文を、見出し・箇条書きを保ちつつ整形してください。不要な罫線/番号/改ページは除去:\n\n" + text
        )
        resp = client.chat.completions.create(
            model=MODEL_TEXT,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=0.2,
        )
        # message.content may be None (e.g. a refusal); treat it as empty.
        return (resp.choices[0].message.content or "").strip()

    # Image/PDF input: build the list of page images.
    if filetype == "pdf":
        images = _pdf_to_images(payload)
    else:
        images = [Image.open(io.BytesIO(payload)).convert("RGB")]
    if not images:
        return ""

    # NOTE(review): the PNG media type assumes _img_to_base64 encodes PNG — confirm.
    image_parts = [
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{_img_to_base64(img)}"},
        }
        for img in images
    ]
    vision_msgs = [
        {"role": "system", "content": "You are an accurate Japanese OCR assistant for resumes."},
        {"role": "user", "content": [
            {
                "type": "text",
                "text": "日本語の履歴書/職務経歴書画像です。OCRして本文を日本語テキストで忠実に返してください。",
            },
            *image_parts,
        ]},
    ]
    resp = client.chat.completions.create(
        model=MODEL_VISION,
        messages=vision_msgs,
        temperature=0.0,
    )
    return (resp.choices[0].message.content or "").strip()
83
 
84
 
85
  def structure_with_openai(text: str) -> dict:
86
  client = _client_lazy()
87
  sys = (
88
+ "あなたは日本語レジュメの構造化アシスタントです。入力テキストからセクションを抽出し、"
89
+ "JSONで返してください。JSONキー: work_experience_raw, education_raw, certifications_raw, skills_list。"
90
+ "skills_list は重複除去済み配列。各 *_raw は原文抜粋で構いません。"
91
  )
92
  user = "以下のテキストを解析し、指定のJSONキーで返してください。\n\n" + text
93
+ resp = client.chat.completions.create(
94
  model=MODEL_TEXT,
95
+ messages=[
96
+ {"role": "system", "content": sys},
97
+ {"role": "user", "content": user},
98
  ],
99
+ temperature=0.2,
100
  response_format={"type": "json_object"},
101
  )
102
  import json as _json
103
  try:
104
+ data = _json.loads(resp.choices[0].message.content)
105
  except Exception:
106
  data = {"work_experience_raw": text, "education_raw": "", "certifications_raw": "", "skills_list": []}
107
  for k in ("work_experience_raw", "education_raw", "certifications_raw"):
 
112
 
def summarize_with_openai(text: str) -> dict:
    """Summarize resume text with ``MODEL_TEXT`` and slice it into three sizes.

    Args:
        text: Resume body to summarize.

    Returns:
        A dict with keys ``"300chars"`` (first 600 characters of the model
        output), ``"100chars"`` (first 120 characters) and ``"onesent"``
        (the output up to and including the first ``。``, or the whole
        output when it contains none). All values are empty strings when
        the model returns no content.
    """
    client = _client_lazy()
    system_prompt = "You write crisp, factual Japanese executive summaries."
    user_prompt = (
        "以下の候補者レジュメ本文を、(1)300字、(2)100字、(3)1文 の3粒度で日本語要約してください。"
        "不要な記号は避け、事実を簡潔に述べてください。\n\n" + text
    )
    resp = client.chat.completions.create(
        model=MODEL_TEXT,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.2,
    )
    # message.content may be None (e.g. a refusal); treat it as empty
    # instead of failing on .strip().
    full = (resp.choices[0].message.content or "").strip()

    # Rule-based slicing so a badly formatted reply still yields all keys.
    head, stop, _ = full.partition("。")
    one_sent = head + stop if stop else full
    return {
        "300chars": full[:600],  # generous cap: 600 characters kept for the "300" slot
        "100chars": full[:120],
        "onesent": one_sent,
    }