Corin1998 committed on
Commit
0b2df0d
·
verified ·
1 Parent(s): 6104937

Update pipelines/anonymize.py

Browse files
Files changed (1) hide show
  1. pipelines/anonymize.py +43 -55
pipelines/anonymize.py CHANGED
@@ -1,75 +1,63 @@
1
  import re
2
- import io
3
- from typing import Dict, Tuple
4
- from reportlab.pdfgen import canvas
5
  from reportlab.lib.pagesizes import A4
6
- from reportlab.pdfbase import pdfmetrics
7
- from reportlab.pdfbase.ttfonts import TTFont
8
-
9
- _MASK = "■" * 4 # マスク文字
10
 
11
- _EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
12
- _PHONE_RE = re.compile(r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}")
13
- _NAME_HINTS = ["氏名", "Name", "名前"]
 
14
 
15
def _mask_name_lines(text: str) -> Tuple[str, Dict[str, str]]:
    """Mask every line that contains one of the name hints.

    Returns a tuple of the rewritten text (lines re-joined with a single
    newline character) and a dict mapping ``LINE_<index>`` to the original
    content of each masked line.
    """
    originals = {}
    masked_lines = []
    for index, row in enumerate(text.splitlines()):
        if not any(hint in row for hint in _NAME_HINTS):
            masked_lines.append(row)
            continue
        # Keep the untouched line so the masking can be traced back later.
        originals[f"LINE_{index}"] = row
        masked_lines.append(re.sub(r".+", _MASK, row))
    return "\n".join(masked_lines), originals
23
 
24
def anonymize_text(text: str) -> Tuple[str, Dict[str, str]]:
    """Mask e-mail addresses, phone numbers and name lines.

    Returns the masked text together with a mapping that records every
    original value that was replaced.
    """
    mapping: Dict[str, str] = {}

    def _recorder(prefix: str):
        # Build a replacement callback that logs each hit under ``prefix``.
        def _replace(match):
            found = match.group(0)
            mapping[f"{prefix}:{found}"] = found
            return _MASK
        return _replace

    masked = _EMAIL_RE.sub(_recorder("EMAIL"), text)
    masked = _PHONE_RE.sub(_recorder("PHONE"), masked)
    masked, name_lines = _mask_name_lines(masked)
    mapping.update(name_lines)
    return masked, mapping
42
 
43
  def render_anonymized_pdf(text: str) -> bytes:
44
- """
45
- シンプルなテキストPDFを生成。
46
- 日本語フォントがある場合は `fonts/NotoSansCJKjp-Regular.otf` を使用。
47
- 無い場合はHelvetica(□が出る可能性あり)。
48
- """
49
  buf = io.BytesIO()
50
  c = canvas.Canvas(buf, pagesize=A4)
51
  width, height = A4
52
 
53
- # フォント登録(任意)
54
- try:
55
- pdfmetrics.registerFont(TTFont("NotoSansJP", "fonts/NotoSansCJKjp-Regular.otf"))
56
- font_name = "NotoSansJP"
57
- except Exception:
58
- font_name = "Helvetica"
59
-
60
- c.setFont(font_name, 10)
61
-
62
- margin = 40
63
- y = height - margin
64
- line_height = 14
65
 
66
  for line in text.splitlines():
67
- if y < margin + line_height:
68
  c.showPage()
69
- c.setFont(font_name, 10)
70
- y = height - margin
71
- c.drawString(margin, y, line[:1800])
72
- y -= line_height
73
 
74
  c.showPage()
75
  c.save()
 
1
  import re
2
+ from typing import Tuple, Dict
 
 
3
  from reportlab.lib.pagesizes import A4
4
+ from reportlab.pdfgen import canvas
5
+ from reportlab.lib.units import mm
6
+ import io
 
7
 
8
# Simple anonymization: mask e-mail addresses, phone numbers and name lines.
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
PHONE_RE = re.compile(r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}")
# A line that starts with a name label, optionally followed by an ASCII or
# full-width (U+FF1A) colon. The whitespace after the label must stay on the
# same line, and an optional trailing "\r" is accepted so CRLF text matches.
NAME_LINE_RE = re.compile(
    r"^(?:氏名|Name)[:\uFF1A]?[^\S\r\n]*([^\n\r]+)\r?$",
    re.MULTILINE,
)


def anonymize_text(text: str) -> Tuple[str, Dict[str, str]]:
    """Replace e-mail addresses, phone numbers and names with placeholders.

    Args:
        text: The text to anonymize.

    Returns:
        A tuple ``(masked_text, mapping)`` where ``mapping`` maps each
        original value to the placeholder key that replaced it
        (``EMAIL_<n>``, ``TEL_<n>`` or ``NAME_<n>``). E-mail addresses appear
        in the text as ``<key>@masked``; phone numbers and names appear as
        the bare key.
    """
    mapping: Dict[str, str] = {}

    def _key_for(original: str, prefix: str) -> str:
        # Reuse the key of a value that was already seen. Numbering new keys
        # from len(mapping) is only collision-free if repeated values do not
        # allocate (and then overwrite) a second key.
        key = mapping.get(original)
        if key is None:
            key = f"{prefix}_{len(mapping) + 1}"
            mapping[original] = key
        return key

    # E-mail addresses
    def _mask_email(m):
        return _key_for(m.group(0), "EMAIL") + "@masked"

    text = EMAIL_RE.sub(_mask_email, text)

    # Phone numbers
    def _mask_phone(m):
        ph = m.group(0)
        # Skip matches with too few digits to be a phone number.
        if len(re.sub(r"\D", "", ph)) < 7:
            return ph
        return _key_for(ph, "TEL")

    text = PHONE_RE.sub(_mask_phone, text)

    # Name lines
    def _mask_name(m):
        raw = m.group(1)
        val = raw.strip()
        if not val:
            # Only whitespace follows the label: nothing to mask.
            return m.group(0)
        key = _key_for(val, "NAME")
        whole = m.group(0)
        # Splice by position instead of str.replace so that a value which
        # also occurs inside the label (e.g. "Name: Na") leaves the label
        # untouched.
        value_start = m.start(1) - m.start(0) + (len(raw) - len(raw.lstrip()))
        return whole[:value_start] + key + whole[value_start + len(val):]

    text = NAME_LINE_RE.sub(_mask_name, text)

    return text, mapping
 
 
 
 
 
42
 
43
  def render_anonymized_pdf(text: str) -> bytes:
 
 
 
 
 
44
  buf = io.BytesIO()
45
  c = canvas.Canvas(buf, pagesize=A4)
46
  width, height = A4
47
 
48
+ # 素朴なテキスト描画(自動改頁)
49
+ margin_x = 15 * mm
50
+ margin_y = 15 * mm
51
+ y = height - margin_y
52
+ c.setFont("Helvetica", 10)
 
 
 
 
 
 
 
53
 
54
  for line in text.splitlines():
55
+ if y < margin_y:
56
  c.showPage()
57
+ c.setFont("Helvetica", 10)
58
+ y = height - margin_y
59
+ c.drawString(margin_x, y, line[:180]) # 超長行は素直に切る
60
+ y -= 12
61
 
62
  c.showPage()
63
  c.save()