Shami96 committed on
Commit
7a2fc08
·
verified ·
1 Parent(s): 24ad2d2

Update extract_red_text.py

Browse files
Files changed (1) hide show
  1. extract_red_text.py +284 -197
extract_red_text.py CHANGED
@@ -1,8 +1,8 @@
1
  #!/usr/bin/env python3
2
  """
3
  extract_red_text.py
4
- Improved version that reuses hf_utils for shared heuristics while preserving
5
- the original schema logic, logging and behavior.
6
  """
7
 
8
  import re
@@ -11,26 +11,62 @@ import sys
11
  from docx import Document
12
  from docx.oxml.ns import qn
13
 
14
- # master schema & patterns (unchanged)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  from master_key import TABLE_SCHEMAS, HEADING_PATTERNS, PARAGRAPH_PATTERNS
16
 
17
- # canonical helpers (from your new hf_utils.py)
18
- from hf_utils import (
19
- is_red_font,
20
- normalize_text,
21
- normalize_header_text,
22
- flatten_json,
23
- find_matching_json_key_and_value,
24
- get_clean_text,
25
- has_red_text,
26
- extract_red_text_segments,
27
- replace_red_text_in_cell,
28
- key_is_forbidden_for_position,
29
- )
30
-
31
- # -------------------------------------------------------------------
32
- # Small XML helper (kept exactly as before — low-level)
33
- # -------------------------------------------------------------------
34
  def _prev_para_text(tbl):
35
  """Get text from previous paragraph before table"""
36
  prev = tbl._tbl.getprevious()
@@ -40,60 +76,123 @@ def _prev_para_text(tbl):
40
  return ""
41
  return "".join(node.text for node in prev.iter() if node.tag.endswith("}t") and node.text).strip()
42
 
43
- # -------------------------------------------------------------------
44
- # Table context helpers (use normalize_text from hf_utils)
45
- # -------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  def fuzzy_match_heading(heading, patterns):
47
- """Check if heading matches any pattern with fuzzy matching"""
48
  if not heading:
49
  return False
50
- heading_norm = normalize_text(heading).upper()
51
  for pattern in patterns:
52
  try:
53
  if re.search(pattern, heading_norm, re.IGNORECASE):
54
  return True
55
  except re.error:
56
- # fallback simple substring if pattern isn't a valid re
57
  if pattern.upper() in heading_norm:
58
  return True
59
  return False
60
 
61
- def get_table_context(tbl):
62
- """Get comprehensive context information for table"""
63
- heading = normalize_text(_prev_para_text(tbl))
64
- # first row headers
65
- headers = [normalize_text(c.text) for c in tbl.rows[0].cells if c.text.strip()] if tbl.rows else []
66
- col0 = [normalize_text(r.cells[0].text) for r in tbl.rows if r.cells and r.cells[0].text.strip()]
67
- first_cell = normalize_text(tbl.rows[0].cells[0].text) if tbl.rows else ""
68
- all_cells = []
69
- for row in tbl.rows:
70
- for cell in row.cells:
71
- text = normalize_text(cell.text)
72
- if text:
73
- all_cells.append(text)
74
- return {
75
- 'heading': heading,
76
- 'headers': headers,
77
- 'col0': col0,
78
- 'first_cell': first_cell,
79
- 'all_cells': all_cells,
80
- 'num_rows': len(tbl.rows),
81
- 'num_cols': len(tbl.rows[0].cells) if tbl.rows else 0
82
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
 
84
- # -------------------------------------------------------------------
85
- # Scoring / matching logic (kept your behavior but using normalize_text)
86
- # -------------------------------------------------------------------
87
  def calculate_schema_match_score(schema_name, spec, context):
88
- """Enhanced calculate match score - IMPROVED for Vehicle Registration tables"""
89
  score = 0
90
  reasons = []
91
 
92
- # VEHICLE REGISTRATION BOOST
93
  if "Vehicle Registration" in schema_name:
94
  vehicle_keywords = ["registration", "vehicle", "sub-contractor", "weight verification", "rfs suspension"]
95
- table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
96
- keyword_matches = sum(1 for keyword in vehicle_keywords if keyword in table_text)
97
  if keyword_matches >= 2:
98
  score += 150
99
  reasons.append(f"Vehicle Registration keywords: {keyword_matches}/5")
@@ -101,53 +200,52 @@ def calculate_schema_match_score(schema_name, spec, context):
101
  score += 75
102
  reasons.append(f"Some Vehicle Registration keywords: {keyword_matches}/5")
103
 
104
- # SUMMARY TABLE BOOST (existing logic)
105
- if "Summary" in schema_name and "details" in " ".join(context['headers']).lower():
106
  score += 100
107
- reasons.append(f"Summary schema with DETAILS column - perfect match")
108
-
109
- if "Summary" not in schema_name and "details" in " ".join(context['headers']).lower():
110
  score -= 75
111
- reasons.append(f"Non-summary schema penalized for DETAILS column presence")
112
 
113
- # Context exclusions
114
  if spec.get("context_exclusions"):
115
- table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
116
- for exclusion in spec["context_exclusions"]:
117
- if exclusion.lower() in table_text:
118
  score -= 50
119
- reasons.append(f"Context exclusion penalty: '{exclusion}' found")
120
 
121
- # Context keywords
122
  if spec.get("context_keywords"):
123
- table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
124
- keyword_matches = 0
125
- for keyword in spec["context_keywords"]:
126
- if keyword.lower() in table_text:
127
- keyword_matches += 1
128
- if keyword_matches > 0:
129
- score += keyword_matches * 15
130
- reasons.append(f"Context keyword matches: {keyword_matches}/{len(spec['context_keywords'])}")
131
-
132
- # Direct first cell match
133
- if context['first_cell'] and context['first_cell'].upper() == schema_name.upper():
134
  score += 100
135
  reasons.append(f"Direct first cell match: '{context['first_cell']}'")
136
 
137
- # Heading pattern matching
138
  if spec.get("headings"):
139
  for h in spec["headings"]:
140
- if fuzzy_match_heading(context['heading'], [h.get("text", "")]):
 
 
 
 
141
  score += 50
142
  reasons.append(f"Heading match: '{context['heading']}'")
143
  break
144
 
145
- # Column header matching
146
  if spec.get("columns"):
147
- cols = [normalize_text(col) for col in spec["columns"]]
148
  matches = 0
149
  for col in cols:
150
- if any(col.upper() in h.upper() for h in context['headers']):
151
  matches += 1
152
  if matches == len(cols):
153
  score += 60
@@ -156,48 +254,47 @@ def calculate_schema_match_score(schema_name, spec, context):
156
  score += matches * 20
157
  reasons.append(f"Partial column matches: {matches}/{len(cols)}")
158
 
159
- # Label matching for left-oriented tables
160
  if spec.get("orientation") == "left":
161
- labels = [normalize_text(lbl) for lbl in spec["labels"]]
162
  matches = 0
163
  for lbl in labels:
164
- if any(lbl.upper() in c.upper() or c.upper() in lbl.upper() for c in context['col0']):
165
  matches += 1
166
  if matches > 0:
167
- score += (matches / len(labels)) * 30
168
  reasons.append(f"Left orientation label matches: {matches}/{len(labels)}")
169
 
170
- # Enhanced Label matching for row1-oriented tables (Vehicle Registration)
171
  elif spec.get("orientation") == "row1":
172
- labels = [normalize_text(lbl) for lbl in spec["labels"]]
173
  matches = 0
174
  for lbl in labels:
175
- if any(lbl.upper() in h.upper() or h.upper() in lbl.upper() for h in context['headers']):
176
  matches += 1
177
- elif any(word.upper() in " ".join(context['headers']).upper() for word in lbl.split() if len(word) > 3):
178
  matches += 0.5
179
  if matches > 0:
180
- score += (matches / len(labels)) * 40
181
  reasons.append(f"Row1 orientation header matches: {matches}/{len(labels)}")
182
 
183
- # Special handling for Declaration tables
184
- if schema_name == "Operator Declaration" and context['first_cell'].upper() == "PRINT NAME":
185
- if "OPERATOR DECLARATION" in context['heading'].upper():
186
  score += 80
187
  reasons.append("Operator Declaration context match")
188
- elif any("MANAGER" in cell.upper() for cell in context['all_cells']):
189
  score += 60
190
  reasons.append("Manager found in cells (likely Operator Declaration)")
191
 
192
- if schema_name == "NHVAS Approved Auditor Declaration" and context['first_cell'].upper() == "PRINT NAME":
193
- if any("MANAGER" in cell.upper() for cell in context['all_cells']):
194
  score -= 50
195
  reasons.append("Penalty: Manager found (not auditor)")
196
 
197
  return score, reasons
198
 
199
  def match_table_schema(tbl):
200
- """Improved table schema matching with scoring system"""
201
  context = get_table_context(tbl)
202
  best_match = None
203
  best_score = 0
@@ -210,23 +307,23 @@ def match_table_schema(tbl):
210
  return best_match
211
  return None
212
 
213
- # -------------------------------------------------------------------
214
- # Multi-schema detection & extraction (kept behavior)
215
- # -------------------------------------------------------------------
216
  def check_multi_schema_table(tbl):
217
- """Check if table contains multiple schemas and split appropriately"""
218
  context = get_table_context(tbl)
219
- operator_labels = ["Operator name (Legal entity)", "NHVAS Accreditation No.", "Registered trading name/s",
220
- "Australian Company Number", "NHVAS Manual"]
 
 
221
  contact_labels = ["Operator business address", "Operator Postal address", "Email address", "Operator Telephone Number"]
222
- has_operator = any(any(op_lbl.upper() in cell.upper() for op_lbl in operator_labels) for cell in context['col0'])
223
- has_contact = any(any(cont_lbl.upper() in cell.upper() for cont_lbl in contact_labels) for cell in context['col0'])
224
  if has_operator and has_contact:
225
  return ["Operator Information", "Operator contact details"]
226
  return None
227
 
228
  def extract_multi_schema_table(tbl, schemas):
229
- """Extract data from table with multiple schemas"""
230
  result = {}
231
  for schema_name in schemas:
232
  if schema_name not in TABLE_SCHEMAS:
@@ -239,7 +336,7 @@ def extract_multi_schema_table(tbl, schemas):
239
  row_label = normalize_text(row.cells[0].text)
240
  belongs_to_schema = False
241
  matched_label = None
242
- for spec_label in spec["labels"]:
243
  spec_norm = normalize_text(spec_label).upper()
244
  row_norm = row_label.upper()
245
  if spec_norm == row_norm or spec_norm in row_norm or row_norm in spec_norm:
@@ -251,29 +348,26 @@ def extract_multi_schema_table(tbl, schemas):
251
  for ci, cell in enumerate(row.cells):
252
  red_txt = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red_font(run)).strip()
253
  if red_txt:
254
- if matched_label not in schema_data:
255
- schema_data[matched_label] = []
256
  if red_txt not in schema_data[matched_label]:
257
  schema_data[matched_label].append(red_txt)
258
  if schema_data:
259
  result[schema_name] = schema_data
260
  return result
261
 
262
- # -------------------------------------------------------------------
263
- # Table extraction for schemas (kept your specialized vehicle handling)
264
- # -------------------------------------------------------------------
265
  def extract_table_data(tbl, schema_name, spec):
266
- """Extract red text data from table based on schema - ENHANCED for Vehicle Registration"""
267
-
268
- # Special handling for vehicle registration tables
269
  if "Vehicle Registration" in schema_name:
270
  print(f" 🚗 EXTRACTION FIX: Processing Vehicle Registration table")
271
- labels = spec["labels"]
272
  collected = {lbl: [] for lbl in labels}
273
  seen = {lbl: set() for lbl in labels}
274
 
275
  if len(tbl.rows) < 2:
276
- print(f" ❌ Vehicle table has less than 2 rows")
277
  return {}
278
 
279
  header_row = tbl.rows[0]
@@ -285,38 +379,40 @@ def extract_table_data(tbl, schema_name, spec):
285
  header_text = normalize_text(cell.text).strip()
286
  if not header_text:
287
  continue
288
-
289
  print(f" Column {col_idx}: '{header_text}'")
290
-
291
- best_match = None
292
- best_score = 0
293
-
294
- for label in labels:
295
- if header_text.upper() == label.upper():
296
- best_match = label
297
- best_score = 1.0
298
- break
299
-
300
- header_words = set(word.upper() for word in header_text.split() if len(word) > 2)
301
- label_words = set(word.upper() for word in label.split() if len(word) > 2)
302
-
303
- if header_words and label_words:
304
- common_words = header_words.intersection(label_words)
305
- if common_words:
306
- score = len(common_words) / max(len(header_words), len(label_words))
307
- if score > best_score and score >= 0.4:
308
- best_score = score
309
- best_match = label
310
-
311
- if best_match:
312
- column_mapping[col_idx] = best_match
313
- print(f" ✅ Mapped to: '{best_match}' (score: {best_score:.2f})")
314
  else:
315
- print(f" ⚠️ No mapping found for '{header_text}'")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
316
 
317
  print(f" 📊 Total column mappings: {len(column_mapping)}")
318
 
319
- # Extract red text from data rows (skip header)
320
  for row_idx in range(1, len(tbl.rows)):
321
  row = tbl.rows[row_idx]
322
  print(f" 📌 Processing data row {row_idx}")
@@ -326,14 +422,14 @@ def extract_table_data(tbl, schema_name, spec):
326
  red_txt = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red_font(run)).strip()
327
  if red_txt:
328
  print(f" 🔴 Found red text in '{label}': '{red_txt}'")
329
- if red_txt not in seen[label]:
330
  seen[label].add(red_txt)
331
- collected[label].append(red_txt)
332
  result = {k: v for k, v in collected.items() if v}
333
  print(f" ✅ Vehicle Registration extracted: {len(result)} columns with data")
334
  return result
335
 
336
- # FALLBACK: original extraction logic for other tables
337
  labels = spec.get("labels", []) + [schema_name]
338
  collected = {lbl: [] for lbl in labels}
339
  seen = {lbl: set() for lbl in labels}
@@ -367,19 +463,15 @@ def extract_table_data(tbl, schema_name, spec):
367
  break
368
  if not lbl:
369
  lbl = schema_name
370
- if red_txt not in seen[lbl]:
371
  seen[lbl].add(red_txt)
372
- collected[lbl].append(red_txt)
373
  return {k: v for k, v in collected.items() if v}
374
 
375
- # -------------------------------------------------------------------
376
- # Main extraction: iterate tables & paragraphs
377
- # -------------------------------------------------------------------
378
  def extract_red_text(input_doc):
379
- """
380
- input_doc: docx.Document object or file path
381
- returns: dict
382
- """
383
  if isinstance(input_doc, str):
384
  doc = Document(input_doc)
385
  else:
@@ -389,76 +481,70 @@ def extract_red_text(input_doc):
389
 
390
  for tbl in doc.tables:
391
  table_count += 1
392
- # Check multi-schema table first
393
  multi_schemas = check_multi_schema_table(tbl)
394
  if multi_schemas:
395
  multi_data = extract_multi_schema_table(tbl, multi_schemas)
396
  for schema_name, schema_data in multi_data.items():
397
  if schema_data:
398
- if schema_name in out:
399
- for k, v in schema_data.items():
400
- if k in out[schema_name]:
401
- out[schema_name][k].extend(v)
402
- else:
403
- out[schema_name][k] = v
404
- else:
405
- out[schema_name] = schema_data
406
  continue
407
 
408
  schema = match_table_schema(tbl)
409
  if not schema:
410
- # keep scanning for tables even if no schema matched
411
  continue
412
  spec = TABLE_SCHEMAS[schema]
413
  data = extract_table_data(tbl, schema, spec)
414
  if data:
415
- if schema in out:
416
- for k, v in data.items():
417
- if k in out[schema]:
418
- out[schema][k].extend(v)
419
- else:
420
- out[schema][k] = v
421
- else:
422
- out[schema] = data
423
-
424
- # paragraphs
425
  paras = {}
426
  for idx, para in enumerate(doc.paragraphs):
427
  red_txt = "".join(r.text for r in para.runs if is_red_font(r)).strip()
428
  if not red_txt:
429
  continue
430
 
431
- # find context heading by scanning backward
432
  context = None
433
- for j in range(idx-1, -1, -1):
434
  txt = normalize_text(doc.paragraphs[j].text)
435
  if txt:
436
- all_patterns = HEADING_PATTERNS["main"] + HEADING_PATTERNS["sub"]
437
- if any(re.search(p, txt, re.IGNORECASE) for p in all_patterns):
438
  context = txt
439
  break
440
 
441
- # if it's date-like and matches date pattern, set context to Date
442
- if not context and re.fullmatch(PARAGRAPH_PATTERNS["date_line"], red_txt):
443
  context = "Date"
444
 
445
  if not context:
446
  context = "(para)"
447
- paras.setdefault(context, []).append(red_txt)
 
 
448
 
449
  if paras:
450
  out["paragraphs"] = paras
451
  return out
452
 
453
- # -------------------------------------------------------------------
454
- # File-like wrapper (keeps API used elsewhere)
455
- # -------------------------------------------------------------------
456
  def extract_red_text_filelike(input_file, output_file):
457
- """
458
- Accepts:
459
- input_file: file-like object (BytesIO/File) or path
460
- output_file: file-like object (opened for writing text) or path
461
- """
462
  if hasattr(input_file, "seek"):
463
  input_file.seek(0)
464
  doc = Document(input_file)
@@ -471,16 +557,17 @@ def extract_red_text_filelike(input_file, output_file):
471
  json.dump(result, f, indent=2, ensure_ascii=False)
472
  return result
473
 
474
- # -------------------------------------------------------------------
475
- # CLI entrypoint (preserve original UX)
476
- # -------------------------------------------------------------------
477
  if __name__ == "__main__":
478
  if len(sys.argv) == 3:
479
  input_docx = sys.argv[1]
480
  output_json = sys.argv[2]
481
  doc = Document(input_docx)
482
  word_data = extract_red_text(doc)
483
- with open(output_json, 'w', encoding='utf-8') as f:
 
484
  json.dump(word_data, f, indent=2, ensure_ascii=False)
485
  print(json.dumps(word_data, indent=2, ensure_ascii=False))
486
  else:
 
1
  #!/usr/bin/env python3
2
  """
3
  extract_red_text.py
4
+ Hardened version: preserves original logic/prints while improving header-label mapping,
5
+ robustness to missing hf_utils and better synonym handling for vehicle tables.
6
  """
7
 
8
  import re
 
11
  from docx import Document
12
  from docx.oxml.ns import qn
13
 
14
+ # Try to reuse your hf_utils if available (non-breaking); otherwise fall back to local helpers.
15
+ try:
16
+ from hf_utils import (
17
+ is_red_font,
18
+ normalize_text,
19
+ normalize_header_text,
20
+ get_clean_text,
21
+ )
22
+ except Exception:
23
+ # Minimal compatible fallbacks if hf_utils is not present.
24
def normalize_text(s: str) -> str:
    """Return *s* with smart dashes normalized, stray punctuation dropped,
    and runs of whitespace collapsed to single spaces. Empty/None -> ""."""
    if not s:
        return ""
    cleaned = re.sub(r"\u2013|\u2014", "-", s)          # en/em dashes -> "-"
    cleaned = re.sub(r"[^\w\s\-\&\(\)\/:]", " ", cleaned)  # keep a small set of punctuation
    return re.sub(r"\s+", " ", cleaned).strip()
31
+
32
def normalize_header_text(s: str) -> str:
    """Canonical upper-cased form used when comparing table headers."""
    normalized = normalize_text(s)
    return normalized.upper()
34
+
35
def is_red_font(run):
    """Best-effort red detection fallback for when hf_utils isn't available.

    First checks the python-docx font color object; if that yields nothing,
    falls back to reading the raw w:color element from the run's XML.
    Returns True only when the RGB value is clearly red-dominant.
    """
    def _looks_red(r, g, b):
        # red channel strong, green/blue weak, and red clearly dominant
        return r > 150 and g < 120 and b < 120 and (r - max(g, b)) > 30

    try:
        color = getattr(run.font, "color", None)
        rgb = getattr(color, "rgb", None) if color else None
        if rgb and _looks_red(rgb[0], rgb[1], rgb[2]):
            return True
    except Exception:
        pass

    # fallback to xml check
    try:
        rPr = getattr(run._element, "rPr", None)
        if rPr is not None:
            clr = rPr.find(qn('w:color'))
            if clr is not None:
                val = clr.get(qn('w:val'))
                if val and re.fullmatch(r"[0-9A-Fa-f]{6}", val):
                    if _looks_red(int(val[:2], 16), int(val[2:4], 16), int(val[4:], 16)):
                        return True
    except Exception:
        pass
    return False
60
+
61
def get_clean_text(elem):
    """Concatenate the text of every w:t descendant of *elem* and strip it."""
    pieces = [node.text for node in elem.iter()
              if node.tag.endswith("}t") and node.text]
    return "".join(pieces).strip()
63
+
64
+ # Import master schemas and patterns (your file)
65
  from master_key import TABLE_SCHEMAS, HEADING_PATTERNS, PARAGRAPH_PATTERNS
66
 
67
+ # ---------------------------------------------------------------------
68
+ # Low-level helpers (kept and hardened)
69
+ # ---------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  def _prev_para_text(tbl):
71
  """Get text from previous paragraph before table"""
72
  prev = tbl._tbl.getprevious()
 
76
  return ""
77
  return "".join(node.text for node in prev.iter() if node.tag.endswith("}t") and node.text).strip()
78
 
79
def get_table_context(tbl):
    """Return structured context for a table: preceding heading, first-row
    headers, first-column labels, first cell, all non-empty cells, and size."""
    rows = tbl.rows
    heading = normalize_text(_prev_para_text(tbl))
    if rows:
        headers = [normalize_text(c.text) for c in rows[0].cells if c.text.strip()]
        first_cell = normalize_text(rows[0].cells[0].text)
        num_cols = len(rows[0].cells)
    else:
        headers = []
        first_cell = ""
        num_cols = 0
    col0 = [normalize_text(r.cells[0].text)
            for r in rows if r.cells and r.cells[0].text.strip()]
    all_cells = [t for row in rows for cell in row.cells
                 if (t := normalize_text(cell.text))]
    return {
        "heading": heading,
        "headers": headers,
        "col0": col0,
        "first_cell": first_cell,
        "all_cells": all_cells,
        "num_rows": len(rows),
        "num_cols": num_cols,
    }
100
+
101
def fuzzy_match_heading(heading, patterns):
    """Return True if *heading* matches any of *patterns*.

    Each pattern is tried as a case-insensitive regex; if it is not a valid
    regex, a plain case-insensitive substring test is used instead.
    """
    if not heading:
        return False
    target = heading.upper()
    for pat in patterns:
        try:
            matched = re.search(pat, target, re.IGNORECASE) is not None
        except re.error:
            matched = pat.upper() in target
        if matched:
            return True
    return False
114
 
115
+ # ---------------------------------------------------------------------
116
+ # Header-to-label synonym map: improved coverage for common OCR/header variants
117
+ # ---------------------------------------------------------------------
118
# Maps a normalized (upper-cased) header variant to its canonical label in
# TABLE_SCHEMAS. NOTE(review): map_header_to_label matches these keys by
# substring in dict insertion order, so ordering is significant — keep the
# short generic keys ("NO", "NO.") in mind when adding longer entries.
HEADER_SYNONYMS = {
    # normalized header (upper) -> canonical label in TABLE_SCHEMAS
    "NO": "No.",
    "NO.": "No.",
    "REG NO": "Registration Number",
    "REGISTRATIONNO": "Registration Number",
    "REGISTRATION NUMBER": "Registration Number",
    "REGISTRATION": "Registration Number",
    "PRINT NAME": "Print Name",
    "NHVR OR EXEMPLAR GLOBAL AUDITOR REGISTRATION NUMBER": "NHVR or Exemplar Global Auditor Registration Number",
    "ROADWORTHINESS CERTIFICATES": "Roadworthiness Certificates",
    "ROADWORTHINESS CERTIFICATES (APPLICABLE FOR ENTRY AUDIT)": "Roadworthiness Certificates",
    "MAINTENANCE RECORDS": "Maintenance Records",
    "DAILY CHECKS": "Daily Checks",
    "FAULT RECORDING/ REPORTING": "Fault Recording/ Reporting",
    "FAULT RECORDING/REPORTING": "Fault Recording/ Reporting",
    "FAULT REPAIR": "Fault Repair",
    "WEIGHT VERIFICATION RECORDS": "Weight Verification Records",
    "RFS SUSPENSION CERTIFICATION #": "RFS Suspension Certification #",
    "SUSPENSION SYSTEM MAINTENANCE": "Suspension System Maintenance",
    "TRIP RECORDS": "Trip Records",
    "FAULT RECORDING/ REPORTING ON SUSPENSION SYSTEM": "Fault Recording/ Reporting",
    # short forms
    "REG NO.": "Registration Number",
    "REGISTRATION #": "Registration Number",
}
144
+
145
def map_header_to_label(header_text, labels):
    """
    Map a raw table header to the best matching schema label.

    Resolution order:
      1. exact (normalized) match against a candidate label,
      2. synonym table, longest keys first — very short keys (<= 3 chars,
         e.g. "NO") must match exactly so they cannot hijack longer headers
         such as "REG NO", which previously mapped to "No." because the
         substring scan ran in dict insertion order,
      3. word-overlap fuzzy scoring (threshold 0.25 for noisy OCR headers).

    Returns the matched label string (preferring the candidate's spelling),
    or the synonym's canonical label if it is absent from *labels*, or None.
    """
    if not header_text:
        return None
    hnorm = normalize_header_text(header_text)

    # 1) exact match to any candidate label — an exact header always wins
    for cand in labels:
        if normalize_header_text(cand) == hnorm:
            return cand

    # 2) synonym map — longest keys first so specific entries beat generic ones
    for key in sorted(HEADER_SYNONYMS, key=len, reverse=True):
        lab = HEADER_SYNONYMS[key]
        # short keys are too ambiguous for substring matching
        hit = (key == hnorm) if len(key) <= 3 else (key in hnorm)
        if not hit:
            continue
        # prefer the candidate label's own spelling when present
        for cand in labels:
            if normalize_header_text(cand) == normalize_header_text(lab):
                return cand
        # labels sometimes omit punctuation — still return the canonical label
        return lab

    # 3) token overlap scoring (flexible, tolerant of reordered words)
    header_words = [w for w in re.split(r"\W+", header_text) if len(w) > 2]
    best_label, best_score = None, 0.0
    for cand in labels:
        cand_words = [w for w in re.split(r"\W+", cand) if len(w) > 2]
        if not cand_words or not header_words:
            continue
        common = {w.upper() for w in header_words} & {w.upper() for w in cand_words}
        score = len(common) / max(1, max(len(header_words), len(cand_words)))
        if score > best_score:
            best_label, best_score = cand, score
    # lower threshold for vehicle tables / noisy OCR (accept >= 0.25)
    if best_score >= 0.25:
        return best_label
    return None
183
 
184
+ # ---------------------------------------------------------------------
185
+ # Matching / scoring logic (keeps original heuristics)
186
+ # ---------------------------------------------------------------------
187
  def calculate_schema_match_score(schema_name, spec, context):
 
188
  score = 0
189
  reasons = []
190
 
191
+ # Vehicle registration boost
192
  if "Vehicle Registration" in schema_name:
193
  vehicle_keywords = ["registration", "vehicle", "sub-contractor", "weight verification", "rfs suspension"]
194
+ table_text = " ".join(context["headers"]).lower() + " " + context["heading"].lower()
195
+ keyword_matches = sum(1 for k in vehicle_keywords if k in table_text)
196
  if keyword_matches >= 2:
197
  score += 150
198
  reasons.append(f"Vehicle Registration keywords: {keyword_matches}/5")
 
200
  score += 75
201
  reasons.append(f"Some Vehicle Registration keywords: {keyword_matches}/5")
202
 
203
+ # Summary boost
204
+ if "Summary" in schema_name and "details" in " ".join(context["headers"]).lower():
205
  score += 100
206
+ reasons.append("Summary schema with DETAILS column - perfect match")
207
+ if "Summary" not in schema_name and "details" in " ".join(context["headers"]).lower():
 
208
  score -= 75
209
+ reasons.append("Non-summary schema penalized for DETAILS column presence")
210
 
211
+ # context exclusions & keywords
212
  if spec.get("context_exclusions"):
213
+ table_text = " ".join(context["headers"]).lower() + " " + context["heading"].lower()
214
+ for exc in spec["context_exclusions"]:
215
+ if exc.lower() in table_text:
216
  score -= 50
217
+ reasons.append(f"Context exclusion penalty: '{exc}'")
218
 
 
219
  if spec.get("context_keywords"):
220
+ table_text = " ".join(context["headers"]).lower() + " " + context["heading"].lower()
221
+ matches = sum(1 for kw in spec["context_keywords"] if kw.lower() in table_text)
222
+ if matches:
223
+ score += matches * 15
224
+ reasons.append(f"Context keyword matches: {matches}/{len(spec['context_keywords'])}")
225
+
226
+ # direct first-cell match
227
+ if context["first_cell"] and context["first_cell"].upper() == schema_name.upper():
 
 
 
228
  score += 100
229
  reasons.append(f"Direct first cell match: '{context['first_cell']}'")
230
 
231
+ # heading pattern
232
  if spec.get("headings"):
233
  for h in spec["headings"]:
234
+ if isinstance(h, dict):
235
+ text = h.get("text", "")
236
+ else:
237
+ text = h
238
+ if fuzzy_match_heading(context["heading"], [text]):
239
  score += 50
240
  reasons.append(f"Heading match: '{context['heading']}'")
241
  break
242
 
243
+ # columns matching
244
  if spec.get("columns"):
245
+ cols = [normalize_text(c) for c in spec["columns"]]
246
  matches = 0
247
  for col in cols:
248
+ if any(col.upper() in h.upper() for h in context["headers"]):
249
  matches += 1
250
  if matches == len(cols):
251
  score += 60
 
254
  score += matches * 20
255
  reasons.append(f"Partial column matches: {matches}/{len(cols)}")
256
 
257
+ # left orientation
258
  if spec.get("orientation") == "left":
259
+ labels = [normalize_text(lbl) for lbl in spec.get("labels", [])]
260
  matches = 0
261
  for lbl in labels:
262
+ if any(lbl.upper() in c.upper() or c.upper() in lbl.upper() for c in context["col0"]):
263
  matches += 1
264
  if matches > 0:
265
+ score += (matches / max(1, len(labels))) * 30
266
  reasons.append(f"Left orientation label matches: {matches}/{len(labels)}")
267
 
268
+ # row1 orientation
269
  elif spec.get("orientation") == "row1":
270
+ labels = [normalize_text(lbl) for lbl in spec.get("labels", [])]
271
  matches = 0
272
  for lbl in labels:
273
+ if any(lbl.upper() in h.upper() or h.upper() in lbl.upper() for h in context["headers"]):
274
  matches += 1
275
+ elif any(word.upper() in " ".join(context["headers"]).upper() for word in lbl.split() if len(word) > 3):
276
  matches += 0.5
277
  if matches > 0:
278
+ score += (matches / max(1, len(labels))) * 40
279
  reasons.append(f"Row1 orientation header matches: {matches}/{len(labels)}")
280
 
281
+ # Declarations special cases
282
+ if schema_name == "Operator Declaration" and context["first_cell"].upper().startswith("PRINT"):
283
+ if "OPERATOR DECLARATION" in context["heading"].upper():
284
  score += 80
285
  reasons.append("Operator Declaration context match")
286
+ elif any("MANAGER" in cell.upper() for cell in context["all_cells"]):
287
  score += 60
288
  reasons.append("Manager found in cells (likely Operator Declaration)")
289
 
290
+ if schema_name == "NHVAS Approved Auditor Declaration" and context["first_cell"].upper().startswith("PRINT"):
291
+ if any("MANAGER" in cell.upper() for cell in context["all_cells"]):
292
  score -= 50
293
  reasons.append("Penalty: Manager found (not auditor)")
294
 
295
  return score, reasons
296
 
297
  def match_table_schema(tbl):
 
298
  context = get_table_context(tbl)
299
  best_match = None
300
  best_score = 0
 
307
  return best_match
308
  return None
309
 
310
+ # ---------------------------------------------------------------------
311
+ # Multi-schema detection & extraction (keeps original behavior)
312
+ # ---------------------------------------------------------------------
313
def check_multi_schema_table(tbl):
    """Return ["Operator Information", "Operator contact details"] when the
    table's first column mixes operator-info rows with contact rows; else None."""
    ctx = get_table_context(tbl)
    operator_labels = [
        "Operator name (Legal entity)", "NHVAS Accreditation No.", "Registered trading name/s",
        "Australian Company Number", "NHVAS Manual"
    ]
    contact_labels = ["Operator business address", "Operator Postal address", "Email address", "Operator Telephone Number"]

    def _col0_mentions(candidates):
        # case-insensitive substring match against first-column cells
        return any(lbl.upper() in cell.upper()
                   for cell in ctx["col0"] for lbl in candidates)

    if _col0_mentions(operator_labels) and _col0_mentions(contact_labels):
        return ["Operator Information", "Operator contact details"]
    return None
325
 
326
  def extract_multi_schema_table(tbl, schemas):
 
327
  result = {}
328
  for schema_name in schemas:
329
  if schema_name not in TABLE_SCHEMAS:
 
336
  row_label = normalize_text(row.cells[0].text)
337
  belongs_to_schema = False
338
  matched_label = None
339
+ for spec_label in spec.get("labels", []):
340
  spec_norm = normalize_text(spec_label).upper()
341
  row_norm = row_label.upper()
342
  if spec_norm == row_norm or spec_norm in row_norm or row_norm in spec_norm:
 
348
  for ci, cell in enumerate(row.cells):
349
  red_txt = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red_font(run)).strip()
350
  if red_txt:
351
+ schema_data.setdefault(matched_label, [])
 
352
  if red_txt not in schema_data[matched_label]:
353
  schema_data[matched_label].append(red_txt)
354
  if schema_data:
355
  result[schema_name] = schema_data
356
  return result
357
 
358
+ # ---------------------------------------------------------------------
359
+ # Extraction: special-case for Vehicle Registration tables (row1) and generic fallback
360
+ # ---------------------------------------------------------------------
361
  def extract_table_data(tbl, schema_name, spec):
362
+ # Vehicle registration special handling
 
 
363
  if "Vehicle Registration" in schema_name:
364
  print(f" 🚗 EXTRACTION FIX: Processing Vehicle Registration table")
365
+ labels = spec.get("labels", [])
366
  collected = {lbl: [] for lbl in labels}
367
  seen = {lbl: set() for lbl in labels}
368
 
369
  if len(tbl.rows) < 2:
370
+ print(" ❌ Vehicle table has less than 2 rows")
371
  return {}
372
 
373
  header_row = tbl.rows[0]
 
379
  header_text = normalize_text(cell.text).strip()
380
  if not header_text:
381
  continue
 
382
  print(f" Column {col_idx}: '{header_text}'")
383
+ mapped = map_header_to_label(header_text, labels)
384
+ if mapped:
385
+ # find exact candidate label string (preserve original label spelling if possible)
386
+ chosen = None
387
+ for cand in labels:
388
+ if normalize_header_text(cand) == normalize_header_text(mapped):
389
+ chosen = cand
390
+ break
391
+ column_mapping[col_idx] = chosen or mapped
392
+ print(f" ✅ Mapped to: '{column_mapping[col_idx]}'")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
393
  else:
394
+ # fallback: try fuzzy token overlap directly with candidate labels
395
+ best = None
396
+ best_score = 0.0
397
+ hwords = [w for w in re.split(r"\W+", header_text) if len(w) > 2]
398
+ for cand in labels:
399
+ cwords = [w for w in re.split(r"\W+", cand) if len(w) > 2]
400
+ if not cwords or not hwords:
401
+ continue
402
+ common = set(w.upper() for w in hwords).intersection(set(w.upper() for w in cwords))
403
+ score = len(common) / max(1, max(len(hwords), len(cwords)))
404
+ if score > best_score:
405
+ best = cand
406
+ best_score = score
407
+ if best and best_score >= 0.25:
408
+ column_mapping[col_idx] = best
409
+ print(f" ✅ Fuzzy-mapped to: '{best}' (score: {best_score:.2f})")
410
+ else:
411
+ print(f" ⚠️ No mapping found for '{header_text}'")
412
 
413
  print(f" 📊 Total column mappings: {len(column_mapping)}")
414
 
415
+ # Extract red text from data rows
416
  for row_idx in range(1, len(tbl.rows)):
417
  row = tbl.rows[row_idx]
418
  print(f" 📌 Processing data row {row_idx}")
 
422
  red_txt = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red_font(run)).strip()
423
  if red_txt:
424
  print(f" 🔴 Found red text in '{label}': '{red_txt}'")
425
+ if red_txt not in seen.setdefault(label, set()):
426
  seen[label].add(red_txt)
427
+ collected.setdefault(label, []).append(red_txt)
428
  result = {k: v for k, v in collected.items() if v}
429
  print(f" ✅ Vehicle Registration extracted: {len(result)} columns with data")
430
  return result
431
 
432
+ # Generic fallback extraction logic
433
  labels = spec.get("labels", []) + [schema_name]
434
  collected = {lbl: [] for lbl in labels}
435
  seen = {lbl: set() for lbl in labels}
 
463
  break
464
  if not lbl:
465
  lbl = schema_name
466
+ if red_txt not in seen.setdefault(lbl, set()):
467
  seen[lbl].add(red_txt)
468
+ collected.setdefault(lbl, []).append(red_txt)
469
  return {k: v for k, v in collected.items() if v}
470
 
471
+ # ---------------------------------------------------------------------
472
+ # Main extraction: process all tables then paragraphs
473
+ # ---------------------------------------------------------------------
474
  def extract_red_text(input_doc):
 
 
 
 
475
  if isinstance(input_doc, str):
476
  doc = Document(input_doc)
477
  else:
 
481
 
482
  for tbl in doc.tables:
483
  table_count += 1
 
484
  multi_schemas = check_multi_schema_table(tbl)
485
  if multi_schemas:
486
  multi_data = extract_multi_schema_table(tbl, multi_schemas)
487
  for schema_name, schema_data in multi_data.items():
488
  if schema_data:
489
+ # merge safely and dedupe
490
+ existing = out.get(schema_name, {})
491
+ for k, v in schema_data.items():
492
+ existing.setdefault(k, [])
493
+ for val in v:
494
+ if val not in existing[k]:
495
+ existing[k].append(val)
496
+ out[schema_name] = existing
497
  continue
498
 
499
  schema = match_table_schema(tbl)
500
  if not schema:
 
501
  continue
502
  spec = TABLE_SCHEMAS[schema]
503
  data = extract_table_data(tbl, schema, spec)
504
  if data:
505
+ existing = out.get(schema, {})
506
+ for k, v in data.items():
507
+ existing.setdefault(k, [])
508
+ for val in v:
509
+ if val not in existing[k]:
510
+ existing[k].append(val)
511
+ out[schema] = existing
512
+
513
+ # Paragraph red-text extraction with context
 
514
  paras = {}
515
  for idx, para in enumerate(doc.paragraphs):
516
  red_txt = "".join(r.text for r in para.runs if is_red_font(r)).strip()
517
  if not red_txt:
518
  continue
519
 
520
+ # find a heading context by scanning backwards
521
  context = None
522
+ for j in range(idx - 1, -1, -1):
523
  txt = normalize_text(doc.paragraphs[j].text)
524
  if txt:
525
+ patterns = HEADING_PATTERNS["main"] + HEADING_PATTERNS["sub"]
526
+ if any(re.search(p, txt, re.IGNORECASE) for p in patterns):
527
  context = txt
528
  break
529
 
530
+ # special-case date-like lines
531
+ if not context and re.fullmatch(PARAGRAPH_PATTERNS.get("date_line", r".*"), red_txt):
532
  context = "Date"
533
 
534
  if not context:
535
  context = "(para)"
536
+ paras.setdefault(context, [])
537
+ if red_txt not in paras[context]:
538
+ paras[context].append(red_txt)
539
 
540
  if paras:
541
  out["paragraphs"] = paras
542
  return out
543
 
544
+ # ---------------------------------------------------------------------
545
+ # File wrapper to support your existing calls
546
+ # ---------------------------------------------------------------------
547
  def extract_red_text_filelike(input_file, output_file):
 
 
 
 
 
548
  if hasattr(input_file, "seek"):
549
  input_file.seek(0)
550
  doc = Document(input_file)
 
557
  json.dump(result, f, indent=2, ensure_ascii=False)
558
  return result
559
 
560
+ # ---------------------------------------------------------------------
561
+ # CLI entrypoint (same as before)
562
+ # ---------------------------------------------------------------------
563
  if __name__ == "__main__":
564
  if len(sys.argv) == 3:
565
  input_docx = sys.argv[1]
566
  output_json = sys.argv[2]
567
  doc = Document(input_docx)
568
  word_data = extract_red_text(doc)
569
+ # write file (dedupe already handled in merging logic above)
570
+ with open(output_json, "w", encoding="utf-8") as f:
571
  json.dump(word_data, f, indent=2, ensure_ascii=False)
572
  print(json.dumps(word_data, indent=2, ensure_ascii=False))
573
  else: