Shami96 committed on
Commit
da7e8af
·
verified ·
1 Parent(s): 4451af2

Update updated_word.py

Files changed (1)
  1. updated_word.py +350 -384
updated_word.py CHANGED
@@ -1,7 +1,20 @@
1
  import json
2
  from docx import Document
3
  from docx.shared import RGBColor
4
  import re
 
5
 
6
  # Heading patterns for document structure detection
7
  HEADING_PATTERNS = {
@@ -32,7 +45,7 @@ HEADING_PATTERNS = {
32
  # ============================================================================
33
 
34
  def load_json(filepath):
35
- with open(filepath, 'r') as file:
36
  return json.load(file)
37
 
38
  def flatten_json(y, prefix=''):
@@ -48,7 +61,12 @@ def flatten_json(y, prefix=''):
48
 
49
  def is_red(run):
50
  color = run.font.color
51
- return color and (color.rgb == RGBColor(255, 0, 0) or getattr(color, "theme_color", None) == 1)
 
 
 
 
 
52
 
53
  def get_value_as_string(value, field_name=""):
54
  if isinstance(value, list):
@@ -90,82 +108,79 @@ def has_red_text_in_paragraph(paragraph):
90
 
91
  def find_matching_json_value(field_name, flat_json):
92
  """Find matching value in JSON with multiple strategies"""
93
- field_name = field_name.strip()
94
-
 
 
95
  # Try exact match first
96
  if field_name in flat_json:
97
  print(f" βœ… Direct match found for key '{field_name}'")
98
  return flat_json[field_name]
99
-
100
  # Try case-insensitive exact match
101
  for key, value in flat_json.items():
102
  if key.lower() == field_name.lower():
103
  print(f" βœ… Case-insensitive match found for key '{field_name}' with JSON key '{key}'")
104
  return value
105
-
106
- # Better Print Name detection for operator vs auditor
107
  if field_name.lower().strip() == "print name":
108
  operator_keys = [k for k in flat_json.keys() if "operator" in k.lower() and "print name" in k.lower()]
109
  auditor_keys = [k for k in flat_json.keys() if "auditor" in k.lower() and ("print name" in k.lower() or "name" in k.lower())]
110
-
111
  if operator_keys:
112
  print(f" βœ… Operator Print Name match: '{field_name}' -> '{operator_keys[0]}'")
113
  return flat_json[operator_keys[0]]
114
  elif auditor_keys:
115
  print(f" βœ… Auditor Name match: '{field_name}' -> '{auditor_keys[0]}'")
116
  return flat_json[auditor_keys[0]]
117
-
118
  # Try suffix matching (for nested keys like "section.field")
119
  for key, value in flat_json.items():
120
  if '.' in key and key.split('.')[-1].lower() == field_name.lower():
121
  print(f" βœ… Suffix match found for key '{field_name}' with JSON key '{key}'")
122
  return value
123
-
124
- # Try partial matching - remove parentheses and special chars
125
  clean_field = re.sub(r'[^\w\s]', ' ', field_name.lower()).strip()
126
  clean_field = re.sub(r'\s+', ' ', clean_field)
127
-
128
  for key, value in flat_json.items():
129
  clean_key = re.sub(r'[^\w\s]', ' ', key.lower()).strip()
130
  clean_key = re.sub(r'\s+', ' ', clean_key)
131
-
132
  if clean_field == clean_key:
133
  print(f" βœ… Clean match found for key '{field_name}' with JSON key '{key}'")
134
  return value
135
-
136
  # Enhanced fuzzy matching with better scoring
137
  field_words = set(word.lower() for word in re.findall(r'\b\w+\b', field_name) if len(word) > 2)
138
  if not field_words:
139
  return None
140
-
141
  best_match = None
142
  best_score = 0
143
  best_key = None
144
-
145
  for key, value in flat_json.items():
146
  key_words = set(word.lower() for word in re.findall(r'\b\w+\b', key) if len(word) > 2)
147
  if not key_words:
148
  continue
149
-
150
- # Calculate similarity score
151
  common_words = field_words.intersection(key_words)
152
  if common_words:
153
- # Use Jaccard similarity: intersection / union
154
  similarity = len(common_words) / len(field_words.union(key_words))
155
-
156
- # Bonus for high word coverage in field_name
157
  coverage = len(common_words) / len(field_words)
158
  final_score = (similarity * 0.6) + (coverage * 0.4)
159
-
160
  if final_score > best_score:
161
  best_score = final_score
162
  best_match = value
163
  best_key = key
164
-
165
  if best_match and best_score >= 0.25:
166
  print(f" βœ… Fuzzy match found for key '{field_name}' with JSON key '{best_key}' (score: {best_score:.2f})")
167
  return best_match
168
-
169
  print(f" ❌ No match found for '{field_name}'")
170
  return None
171
 
@@ -176,11 +191,11 @@ def find_matching_json_value(field_name, flat_json):
176
  def extract_red_text_segments(cell):
177
  """Extract red text segments from a cell"""
178
  red_segments = []
179
-
180
  for para_idx, paragraph in enumerate(cell.paragraphs):
181
  current_segment = ""
182
  segment_runs = []
183
-
184
  for run_idx, run in enumerate(paragraph.runs):
185
  if is_red(run):
186
  if run.text:
@@ -196,7 +211,7 @@ def extract_red_text_segments(cell):
196
  })
197
  current_segment = ""
198
  segment_runs = []
199
-
200
  # Handle segment at end of paragraph
201
  if segment_runs:
202
  red_segments.append({
@@ -204,21 +219,21 @@ def extract_red_text_segments(cell):
204
  'runs': segment_runs.copy(),
205
  'paragraph_idx': para_idx
206
  })
207
-
208
  return red_segments
209
 
210
  def replace_all_red_segments(red_segments, replacement_text):
211
  """Replace all red segments with replacement text"""
212
  if not red_segments:
213
  return 0
214
-
215
  if '\n' in replacement_text:
216
  replacement_lines = replacement_text.split('\n')
217
  else:
218
  replacement_lines = [replacement_text]
219
-
220
  replacements_made = 0
221
-
222
  if red_segments and replacement_lines:
223
  first_segment = red_segments[0]
224
  if first_segment['runs']:
@@ -226,56 +241,57 @@ def replace_all_red_segments(red_segments, replacement_text):
226
  first_run.text = replacement_lines[0]
227
  first_run.font.color.rgb = RGBColor(0, 0, 0)
228
  replacements_made = 1
229
-
230
  for _, _, run in first_segment['runs'][1:]:
231
  run.text = ''
232
-
233
  for segment in red_segments[1:]:
234
  for _, _, run in segment['runs']:
235
  run.text = ''
236
-
237
  if len(replacement_lines) > 1 and red_segments:
238
  try:
239
  first_run = red_segments[0]['runs'][0][2]
240
  paragraph = first_run.element.getparent()
241
-
 
 
242
  for line in replacement_lines[1:]:
243
  if line.strip():
244
- from docx.oxml import OxmlElement
245
  br = OxmlElement('w:br')
246
  first_run.element.append(br)
247
-
248
  new_run = paragraph.add_run(line.strip())
249
  new_run.font.color.rgb = RGBColor(0, 0, 0)
250
- except:
251
  if red_segments and red_segments[0]['runs']:
252
  first_run = red_segments[0]['runs'][0][2]
253
  first_run.text = ' '.join(replacement_lines)
254
  first_run.font.color.rgb = RGBColor(0, 0, 0)
255
-
256
  return replacements_made
257
 
258
  def replace_single_segment(segment, replacement_text):
259
  """Replace a single red text segment"""
260
  if not segment['runs']:
261
  return False
262
-
263
  first_run = segment['runs'][0][2]
264
  first_run.text = replacement_text
265
  first_run.font.color.rgb = RGBColor(0, 0, 0)
266
-
267
  for _, _, run in segment['runs'][1:]:
268
  run.text = ''
269
-
270
  return True
271
 
272
  def replace_red_text_in_cell(cell, replacement_text):
273
  """Replace red text in a cell with replacement text"""
274
  red_segments = extract_red_text_segments(cell)
275
-
276
  if not red_segments:
277
  return 0
278
-
279
  return replace_all_red_segments(red_segments, replacement_text)
280
 
281
  # ============================================================================
@@ -298,132 +314,132 @@ def handle_australian_company_number(row, company_numbers):
298
  def handle_vehicle_registration_table(table, flat_json):
299
  """Handle vehicle registration table data replacement"""
300
  replacements_made = 0
301
-
302
  # Try to find vehicle registration data
303
  vehicle_section = None
304
-
305
  for key, value in flat_json.items():
306
  if "vehicle registration numbers of records examined" in key.lower():
307
  if isinstance(value, dict):
308
  vehicle_section = value
309
  print(f" βœ… Found vehicle data in key: '{key}'")
310
  break
311
-
312
  if not vehicle_section:
313
  potential_columns = {}
314
  for key, value in flat_json.items():
315
- if any(col_name in key.lower() for col_name in ["registration number", "sub-contractor", "weight verification", "rfs suspension"]):
316
  if "." in key:
317
  column_name = key.split(".")[-1]
318
  else:
319
  column_name = key
320
  potential_columns[column_name] = value
321
-
322
  if potential_columns:
323
  vehicle_section = potential_columns
324
  print(f" βœ… Found vehicle data from flattened keys: {list(vehicle_section.keys())}")
325
  else:
326
  print(f" ❌ Vehicle registration data not found in JSON")
327
  return 0
328
-
329
  print(f" βœ… Found vehicle registration data with {len(vehicle_section)} columns")
330
-
331
  # Find header row
332
  header_row_idx = -1
333
  header_row = None
334
-
335
  for row_idx, row in enumerate(table.rows):
336
  row_text = "".join(get_clean_text(cell).lower() for cell in row.cells)
337
  if "registration" in row_text and "number" in row_text:
338
  header_row_idx = row_idx
339
  header_row = row
340
  break
341
-
342
  if header_row_idx == -1:
343
  print(f" ❌ Could not find header row in vehicle table")
344
  return 0
345
-
346
  print(f" βœ… Found header row at index {header_row_idx}")
347
-
348
- # Enhanced column mapping
349
  column_mapping = {}
350
  for col_idx, cell in enumerate(header_row.cells):
351
  header_text = get_clean_text(cell).strip()
352
  if not header_text or header_text.lower() == "no.":
353
  continue
354
-
355
  best_match = None
356
  best_score = 0
357
-
358
  normalized_header = header_text.lower().replace("(", " (").replace(")", ") ").strip()
359
-
360
  for json_key in vehicle_section.keys():
361
  normalized_json = json_key.lower().strip()
362
-
363
  if normalized_header == normalized_json:
364
  best_match = json_key
365
  best_score = 1.0
366
  break
367
-
368
  header_words = set(word.lower() for word in normalized_header.split() if len(word) > 2)
369
  json_words = set(word.lower() for word in normalized_json.split() if len(word) > 2)
370
-
371
  if header_words and json_words:
372
  common_words = header_words.intersection(json_words)
373
  score = len(common_words) / max(len(header_words), len(json_words))
374
-
375
  if score > best_score and score >= 0.3:
376
  best_score = score
377
  best_match = json_key
378
-
379
  header_clean = normalized_header.replace(" ", "").replace("-", "").replace("(", "").replace(")", "")
380
  json_clean = normalized_json.replace(" ", "").replace("-", "").replace("(", "").replace(")", "")
381
-
382
  if header_clean in json_clean or json_clean in header_clean:
383
  if len(header_clean) > 5 and len(json_clean) > 5:
384
  substring_score = min(len(header_clean), len(json_clean)) / max(len(header_clean), len(json_clean))
385
  if substring_score > best_score and substring_score >= 0.6:
386
  best_score = substring_score
387
  best_match = json_key
388
-
389
  if best_match:
390
  column_mapping[col_idx] = best_match
391
  print(f" πŸ“Œ Column {col_idx + 1} ('{header_text}') -> '{best_match}' (score: {best_score:.2f})")
392
-
393
  if not column_mapping:
394
  print(f" ❌ No column mappings found")
395
  return 0
396
-
397
  # Determine data rows needed
398
  max_data_rows = 0
399
  for json_key, data in vehicle_section.items():
400
  if isinstance(data, list):
401
  max_data_rows = max(max_data_rows, len(data))
402
-
403
  print(f" πŸ“Œ Need to populate {max_data_rows} data rows")
404
-
405
  # Process data rows
406
  for data_row_index in range(max_data_rows):
407
  table_row_idx = header_row_idx + 1 + data_row_index
408
-
409
  if table_row_idx >= len(table.rows):
410
  print(f" ⚠️ Row {table_row_idx + 1} doesn't exist - table only has {len(table.rows)} rows")
411
  print(f" βž• Adding new row for vehicle {data_row_index + 1}")
412
-
413
  new_row = table.add_row()
414
  print(f" βœ… Successfully added row {len(table.rows)} to the table")
415
-
416
  row = table.rows[table_row_idx]
417
  print(f" πŸ“Œ Processing data row {table_row_idx + 1} (vehicle {data_row_index + 1})")
418
-
419
  for col_idx, json_key in column_mapping.items():
420
  if col_idx < len(row.cells):
421
  cell = row.cells[col_idx]
422
-
423
  column_data = vehicle_section.get(json_key, [])
424
  if isinstance(column_data, list) and data_row_index < len(column_data):
425
  replacement_value = str(column_data[data_row_index])
426
-
427
  cell_text = get_clean_text(cell)
428
  if has_red_text(cell) or not cell_text.strip():
429
  if not cell_text.strip():
@@ -435,39 +451,39 @@ def handle_vehicle_registration_table(table, flat_json):
435
  replacements_made += cell_replacements
436
  if cell_replacements > 0:
437
  print(f" -> Replaced red text with '{replacement_value}' (column '{json_key}')")
438
-
439
  return replacements_made
440
 
441
  def handle_attendance_list_table_enhanced(table, flat_json):
442
  """Enhanced Attendance List processing with better detection"""
443
  replacements_made = 0
444
-
445
  # Check multiple patterns for attendance list
446
  attendance_patterns = [
447
  "attendance list",
448
  "names and position titles",
449
  "attendees"
450
  ]
451
-
452
  # Scan all cells in the first few rows for attendance list indicators
453
  found_attendance_row = None
454
-
455
  for row_idx, row in enumerate(table.rows[:3]): # Check first 3 rows
456
  for cell_idx, cell in enumerate(row.cells):
457
  cell_text = get_clean_text(cell).lower()
458
-
459
  # Check if this cell contains attendance list header
460
  if any(pattern in cell_text for pattern in attendance_patterns):
461
  found_attendance_row = row_idx
462
  print(f" 🎯 ENHANCED: Found Attendance List in row {row_idx + 1}, cell {cell_idx + 1}")
463
  break
464
-
465
  if found_attendance_row is not None:
466
  break
467
-
468
  if found_attendance_row is None:
469
  return 0
470
-
471
  # Look for attendance data in JSON
472
  attendance_value = None
473
  attendance_search_keys = [
@@ -476,226 +492,226 @@ def handle_attendance_list_table_enhanced(table, flat_json):
476
  "attendance list",
477
  "attendees"
478
  ]
479
-
480
  print(f" πŸ” Searching for attendance data in JSON...")
481
-
482
  for search_key in attendance_search_keys:
483
  attendance_value = find_matching_json_value(search_key, flat_json)
484
  if attendance_value is not None:
485
  print(f" βœ… Found attendance data with key: '{search_key}'")
486
  print(f" πŸ“Š Raw value: {attendance_value}")
487
  break
488
-
489
  if attendance_value is None:
490
  print(f" ❌ No attendance data found in JSON")
491
  return 0
492
-
493
  # Look for red text in ALL cells of the table
494
  target_cell = None
495
-
496
  print(f" πŸ” Scanning ALL cells in attendance table for red text...")
497
-
498
  for row_idx, row in enumerate(table.rows):
499
  for cell_idx, cell in enumerate(row.cells):
500
  if has_red_text(cell):
501
  print(f" 🎯 Found red text in row {row_idx + 1}, cell {cell_idx + 1}")
502
-
503
  # Get the red text to see if it looks like attendance data
504
  red_text = ""
505
  for paragraph in cell.paragraphs:
506
  for run in paragraph.runs:
507
  if is_red(run):
508
  red_text += run.text
509
-
510
  print(f" πŸ“‹ Red text content: '{red_text[:50]}...'")
511
-
512
  # Check if this red text looks like attendance data (contains names/manager/etc)
513
  red_text_lower = red_text.lower()
514
  if any(indicator in red_text_lower for indicator in ['manager', 'herbig', 'palin', '–', '-']):
515
  target_cell = cell
516
  print(f" βœ… This looks like attendance data - using this cell")
517
  break
518
-
519
  if target_cell is not None:
520
  break
521
-
522
  # If no red text found that looks like attendance data, return
523
  if target_cell is None:
524
  print(f" ⚠️ No red text found that looks like attendance data")
525
  return 0
526
-
527
  # Replace red text with properly formatted attendance list
528
  if has_red_text(target_cell):
529
  print(f" πŸ”§ Replacing red text with properly formatted attendance list...")
530
-
531
  # Ensure attendance_value is a list
532
  if isinstance(attendance_value, list):
533
  attendance_list = [str(item).strip() for item in attendance_value if str(item).strip()]
534
  else:
535
  attendance_list = [str(attendance_value).strip()]
536
-
537
  print(f" πŸ“ Attendance items to add:")
538
  for i, item in enumerate(attendance_list):
539
  print(f" {i+1}. {item}")
540
-
541
  # Replace with line-separated attendance list
542
  replacement_text = "\n".join(attendance_list)
543
  cell_replacements = replace_red_text_in_cell(target_cell, replacement_text)
544
  replacements_made += cell_replacements
545
-
546
  print(f" βœ… Added {len(attendance_list)} attendance items")
547
  print(f" πŸ“Š Replacements made: {cell_replacements}")
548
-
549
  return replacements_made
550
 
551
  def fix_management_summary_details_column(table, flat_json):
552
  """Fix the DETAILS column in Management Summary table"""
553
  replacements_made = 0
554
-
555
  print(f" 🎯 FIX: Management Summary DETAILS column processing")
556
-
557
  # Check if this is a Management Summary table
558
  table_text = ""
559
  for row in table.rows[:2]:
560
  for cell in row.cells:
561
  table_text += get_clean_text(cell).lower() + " "
562
-
563
  if not ("mass management" in table_text and "details" in table_text):
564
  return 0
565
-
566
  print(f" βœ… Confirmed Mass Management Summary table")
567
-
568
  # Process each row looking for Std 5. and Std 6. with red text
569
  for row_idx, row in enumerate(table.rows):
570
  if len(row.cells) >= 2:
571
  standard_cell = row.cells[0]
572
  details_cell = row.cells[1]
573
-
574
  standard_text = get_clean_text(standard_cell).strip()
575
-
576
  # Look for Std 5. Verification and Std 6. Internal Review specifically
577
  if "Std 5." in standard_text and "Verification" in standard_text:
578
  if has_red_text(details_cell):
579
  print(f" πŸ” Found Std 5. Verification with red text")
580
-
581
  json_value = find_matching_json_value("Std 5. Verification", flat_json)
582
  if json_value is not None:
583
  replacement_text = get_value_as_string(json_value, "Std 5. Verification")
584
  cell_replacements = replace_red_text_in_cell(details_cell, replacement_text)
585
  replacements_made += cell_replacements
586
  print(f" βœ… Replaced Std 5. Verification details")
587
-
588
  elif "Std 6." in standard_text and "Internal Review" in standard_text:
589
  if has_red_text(details_cell):
590
  print(f" πŸ” Found Std 6. Internal Review with red text")
591
-
592
  json_value = find_matching_json_value("Std 6. Internal Review", flat_json)
593
  if json_value is not None:
594
  replacement_text = get_value_as_string(json_value, "Std 6. Internal Review")
595
  cell_replacements = replace_red_text_in_cell(details_cell, replacement_text)
596
  replacements_made += cell_replacements
597
  print(f" βœ… Replaced Std 6. Internal Review details")
598
-
599
  return replacements_made
600
 
 
 
 
 
601
  def fix_operator_declaration_empty_values(table, flat_json):
602
  """Fix Operator Declaration table when values are empty or need updating"""
603
  replacements_made = 0
604
-
605
  print(f" 🎯 FIX: Operator Declaration empty values processing")
606
-
607
  # Check if this is an Operator Declaration table
608
  table_context = ""
609
  for row in table.rows:
610
  for cell in row.cells:
611
  table_context += get_clean_text(cell).lower() + " "
612
-
613
  if not ("print name" in table_context and "position title" in table_context):
614
  return 0
615
-
616
  print(f" βœ… Confirmed Operator Declaration table")
617
-
618
  # Find the data row with Print Name and Position Title
619
  for row_idx, row in enumerate(table.rows):
620
  if len(row.cells) >= 2:
621
  cell1_text = get_clean_text(row.cells[0]).strip().lower()
622
  cell2_text = get_clean_text(row.cells[1]).strip().lower()
623
-
624
  # Check if this is the header row
625
  if "print name" in cell1_text and "position" in cell2_text:
626
  print(f" πŸ“Œ Found header row at {row_idx + 1}")
627
-
628
  # Look for the data row (next row)
629
  if row_idx + 1 < len(table.rows):
630
  data_row = table.rows[row_idx + 1]
631
  if len(data_row.cells) >= 2:
632
  name_cell = data_row.cells[0]
633
  position_cell = data_row.cells[1]
634
-
635
  # Check if cells are empty or have red text
636
  name_text = get_clean_text(name_cell).strip()
637
  position_text = get_clean_text(position_cell).strip()
638
-
639
  print(f" πŸ“‹ Current values: Name='{name_text}', Position='{position_text}'")
640
-
641
- # FORCE UPDATE - try direct fields
642
- print(f" πŸ”§ FORCE updating Print Name")
643
  name_value = find_matching_json_value("Operator Declaration.Print Name", flat_json)
 
 
 
644
  if name_value:
645
  new_name = get_value_as_string(name_value).strip()
646
  if new_name and "Pty Ltd" not in new_name and "Company" not in new_name and "Farming" not in new_name:
647
- name_cell.text = new_name # FORCE replace
 
 
 
 
648
  replacements_made += 1
649
  print(f" βœ… FORCE Updated Print Name: '{name_text}' -> '{new_name}'")
650
-
651
- print(f" πŸ”§ FORCE updating Position Title")
652
  position_value = find_matching_json_value("Operator Declaration.Position Title", flat_json)
 
 
653
  if position_value:
654
  new_position = get_value_as_string(position_value).strip()
655
  if new_position:
656
- position_cell.text = new_position # FORCE replace
 
 
 
657
  replacements_made += 1
658
  print(f" βœ… FORCE Updated Position Title: '{position_text}' -> '{new_position}'")
659
-
660
- # If still no updates, try alternative sources
661
- if replacements_made == 0:
662
- print(f" πŸ”§ Trying alternative sources...")
663
-
664
- # Try Print Name alternatives
665
- alt_name_sources = ["Print Name"]
666
- for source in alt_name_sources:
667
- name_value = find_matching_json_value(source, flat_json)
668
- if name_value:
669
- new_name = get_value_as_string(name_value).strip()
670
- if new_name and "Pty Ltd" not in new_name and "Company" not in new_name and "Farming" not in new_name:
671
- name_cell.text = new_name
672
- replacements_made += 1
673
- print(f" βœ… Updated Print Name (alt): '{new_name}' from {source}")
674
- break
675
-
676
- # Try Position Title alternatives
677
- alt_position_sources = ["Position Title"]
678
- for source in alt_position_sources:
679
- position_value = find_matching_json_value(source, flat_json)
680
- if position_value:
681
- new_position = get_value_as_string(position_value).strip()
682
- if new_position:
683
- position_cell.text = new_position
684
- replacements_made += 1
685
- print(f" βœ… Updated Position Title (alt): '{new_position}' from {source}")
686
- break
687
  break
688
-
 
 
 
 
 
 
 
 
 
689
  return replacements_made
690
 
691
  def handle_multiple_red_segments_in_cell(cell, flat_json):
692
  """Handle multiple red text segments within a single cell"""
693
  replacements_made = 0
694
-
695
  red_segments = extract_red_text_segments(cell)
696
  if not red_segments:
697
  return 0
698
-
699
  # Try to match each segment individually
700
  for i, segment in enumerate(red_segments):
701
  segment_text = segment['text'].strip()
@@ -706,24 +722,24 @@ def handle_multiple_red_segments_in_cell(cell, flat_json):
706
  if replace_single_segment(segment, replacement_text):
707
  replacements_made += 1
708
  print(f" βœ… Replaced segment {i+1}: '{segment_text}' -> '{replacement_text}'")
709
-
710
  return replacements_made
711
 
712
  def handle_nature_business_multiline_fix(cell, flat_json):
713
  """Handle Nature of Business multiline red text"""
714
  replacements_made = 0
715
-
716
  # Extract red text to check if it looks like nature of business
717
  red_text = ""
718
  for paragraph in cell.paragraphs:
719
  for run in paragraph.runs:
720
  if is_red(run):
721
  red_text += run.text
722
-
723
  red_text = red_text.strip()
724
  if not red_text:
725
  return 0
726
-
727
  # Check if this looks like nature of business content
728
  nature_indicators = ["transport", "logistics", "freight", "delivery", "trucking", "haulage"]
729
  if any(indicator in red_text.lower() for indicator in nature_indicators):
@@ -734,27 +750,27 @@ def handle_nature_business_multiline_fix(cell, flat_json):
734
  cell_replacements = replace_red_text_in_cell(cell, replacement_text)
735
  replacements_made += cell_replacements
736
  print(f" βœ… Fixed Nature of Business multiline content")
737
-
738
  return replacements_made
739
 
740
  def handle_management_summary_fix(cell, flat_json):
741
  """Handle Management Summary content fixes"""
742
  replacements_made = 0
743
-
744
  # Extract red text
745
  red_text = ""
746
  for paragraph in cell.paragraphs:
747
  for run in paragraph.runs:
748
  if is_red(run):
749
  red_text += run.text
750
-
751
  red_text = red_text.strip()
752
  if not red_text:
753
  return 0
754
-
755
  # Look for management summary data in new schema format
756
  management_types = ["Mass Management Summary", "Maintenance Management Summary", "Fatigue Management Summary"]
757
-
758
  for mgmt_type in management_types:
759
  if mgmt_type in flat_json:
760
  mgmt_data = flat_json[mgmt_type]
@@ -771,123 +787,43 @@ def handle_management_summary_fix(cell, flat_json):
771
  replacements_made += cell_replacements
772
  print(f" βœ… Fixed {mgmt_type} - {std_key}")
773
  return replacements_made
774
-
775
- return replacements_made
776
 
777
- def fix_operator_declaration_empty_values(table, flat_json):
778
- """Fix Operator Declaration table when values are empty or need updating"""
779
- replacements_made = 0
780
-
781
- print(f" 🎯 FIX: Operator Declaration empty values processing")
782
-
783
- # Check if this is an Operator Declaration table
784
- table_context = ""
785
- for row in table.rows:
786
- for cell in row.cells:
787
- table_context += get_clean_text(cell).lower() + " "
788
-
789
- if not ("print name" in table_context and "position title" in table_context):
790
- return 0
791
-
792
- print(f" βœ… Confirmed Operator Declaration table")
793
-
794
- # Find the data row with Print Name and Position Title
795
- for row_idx, row in enumerate(table.rows):
796
- if len(row.cells) >= 2:
797
- cell1_text = get_clean_text(row.cells[0]).strip().lower()
798
- cell2_text = get_clean_text(row.cells[1]).strip().lower()
799
-
800
- # Check if this is the header row
801
- if "print name" in cell1_text and "position" in cell2_text:
802
- print(f" πŸ“Œ Found header row at {row_idx + 1}")
803
-
804
- # Look for the data row (next row)
805
- if row_idx + 1 < len(table.rows):
806
- data_row = table.rows[row_idx + 1]
807
- if len(data_row.cells) >= 2:
808
- name_cell = data_row.cells[0]
809
- position_cell = data_row.cells[1]
810
-
811
- # Check if cells are empty or have red text
812
- name_text = get_clean_text(name_cell).strip()
813
- position_text = get_clean_text(position_cell).strip()
814
-
815
- print(f" πŸ“‹ Current values: Name='{name_text}', Position='{position_text}'")
816
-
817
- # FORCE UPDATE - try direct fields
818
- print(f" πŸ”§ FORCE updating Print Name")
819
- name_value = find_matching_json_value("Operator Declaration.Print Name", flat_json)
820
- if name_value:
821
- new_name = get_value_as_string(name_value).strip()
822
- if new_name and "Pty Ltd" not in new_name and "Company" not in new_name and "Farming" not in new_name:
823
- name_cell.text = new_name # FORCE replace
824
- replacements_made += 1
825
- print(f" βœ… FORCE Updated Print Name: '{name_text}' -> '{new_name}'")
826
-
827
- print(f" πŸ”§ FORCE updating Position Title")
828
- position_value = find_matching_json_value("Operator Declaration.Position Title", flat_json)
829
- if position_value:
830
- new_position = get_value_as_string(position_value).strip()
831
- if new_position:
832
- position_cell.text = new_position # FORCE replace
833
- replacements_made += 1
834
- print(f" βœ… FORCE Updated Position Title: '{position_text}' -> '{new_position}'")
835
-
836
- # If still no updates, try alternative sources
837
- if replacements_made == 0:
838
- print(f" πŸ”§ Trying alternative sources...")
839
-
840
- # Try Print Name alternatives
841
- alt_name_sources = ["Print Name"]
842
- for source in alt_name_sources:
843
- name_value = find_matching_json_value(source, flat_json)
844
- if name_value:
845
- new_name = get_value_as_string(name_value).strip()
846
- if new_name and "Pty Ltd" not in new_name and "Company" not in new_name and "Farming" not in new_name:
847
- name_cell.text = new_name
848
- replacements_made += 1
849
- print(f" βœ… Updated Print Name (alt): '{new_name}' from {source}")
850
- break
851
-
852
- # Try Position Title alternatives
853
- alt_position_sources = ["Position Title"]
854
- for source in alt_position_sources:
855
- position_value = find_matching_json_value(source, flat_json)
856
- if position_value:
857
- new_position = get_value_as_string(position_value).strip()
858
- if new_position:
859
- position_cell.text = new_position
860
- replacements_made += 1
861
- print(f" βœ… Updated Position Title (alt): '{new_position}' from {source}")
862
- break
863
- break
864
-
865
  return replacements_made
866
 
 
 
 
 
867
  def handle_operator_declaration_fix(table, flat_json):
868
  """Handle small Operator/Auditor Declaration tables - SKIP if already processed"""
869
  replacements_made = 0
870
-
 
 
 
 
 
 
871
  if len(table.rows) > 4: # Only process small tables
872
  return 0
873
-
874
  # Get table context
875
  table_text = ""
876
  for row in table.rows:
877
  for cell in row.cells:
878
  table_text += get_clean_text(cell).lower() + " "
879
-
880
  # SKIP if this is an Operator Declaration table (already handled by fix_operator_declaration_empty_values)
881
  if "print name" in table_text and "position title" in table_text:
882
  print(f" ⏭️ Skipping - Operator Declaration table already processed")
883
  return 0
884
-
885
  # Check if this is a declaration table
886
  if not ("print name" in table_text or "signature" in table_text or "date" in table_text):
887
  return 0
888
-
889
  print(f" 🎯 Processing other declaration table")
890
-
891
  # Process each cell with red text (for auditor declarations, etc.)
892
  for row_idx, row in enumerate(table.rows):
893
  for cell_idx, cell in enumerate(row.cells):
@@ -896,10 +832,10 @@ def handle_operator_declaration_fix(table, flat_json):
896
  declaration_fields = [
897
  "NHVAS Approved Auditor Declaration.Print Name",
898
  "Auditor name",
899
- "Signature",
900
  "Date"
901
  ]
902
-
903
  replaced = False
904
  for field in declaration_fields:
905
  field_value = find_matching_json_value(field, flat_json)
@@ -912,7 +848,7 @@ def handle_operator_declaration_fix(table, flat_json):
912
  print(f" βœ… Fixed declaration field: {field}")
913
  replaced = True
914
  break
915
-
916
  # If no specific field match, try generic signature/date
917
  if not replaced:
918
  red_text = ""
@@ -920,42 +856,49 @@ def handle_operator_declaration_fix(table, flat_json):
920
  for run in paragraph.runs:
921
  if is_red(run):
922
  red_text += run.text
923
-
924
  if "signature" in red_text.lower():
925
  cell_replacements = replace_red_text_in_cell(cell, "[Signature]")
926
  replacements_made += cell_replacements
927
  elif "date" in red_text.lower():
928
  cell_replacements = replace_red_text_in_cell(cell, "[Date]")
929
  replacements_made += cell_replacements
930
-
931
  return replacements_made
932
 
933
  def handle_print_accreditation_section(table, flat_json):
934
  """Handle Print Accreditation section - SKIP Operator Declaration tables"""
935
  replacements_made = 0
936
-
 
 
 
 
 
 
937
  # Get table context to check what type of table this is
938
  table_context = ""
939
  for row in table.rows:
940
  for cell in row.cells:
941
  table_context += get_clean_text(cell).lower() + " "
942
-
943
  # SKIP if this is an Operator Declaration table
944
  if "operator declaration" in table_context or ("print name" in table_context and "position title" in table_context):
945
  print(f" ⏭️ Skipping Print Accreditation - this is an Operator Declaration table")
946
  return 0
947
-
948
  print(f" πŸ“‹ Processing Print Accreditation section")
949
-
950
  for row_idx, row in enumerate(table.rows):
951
  for cell_idx, cell in enumerate(row.cells):
952
  if has_red_text(cell):
953
  # Try print accreditation fields
954
  accreditation_fields = [
955
  "(print accreditation name)",
956
- "Operator name (Legal entity)"
 
957
  ]
958
-
959
  for field in accreditation_fields:
960
  field_value = find_matching_json_value(field, flat_json)
961
  if field_value is not None:
@@ -966,43 +909,47 @@ def handle_print_accreditation_section(table, flat_json):
966
  if cell_replacements > 0:
967
  print(f" βœ… Fixed accreditation: {field}")
968
  break
969
-
970
  return replacements_made
971
 
972
  def process_single_column_sections(cell, key_text, flat_json):
973
  """Process single column sections with red text"""
974
  replacements_made = 0
975
-
976
  if has_red_text(cell):
977
  red_text = ""
978
  for paragraph in cell.paragraphs:
979
  for run in paragraph.runs:
980
  if is_red(run):
981
  red_text += run.text
982
-
983
  if red_text.strip():
984
  # Try direct matching first
985
  section_value = find_matching_json_value(red_text.strip(), flat_json)
986
  if section_value is None:
987
  # Try key-based matching
988
  section_value = find_matching_json_value(key_text, flat_json)
989
-
990
  if section_value is not None:
991
  section_replacement = get_value_as_string(section_value, red_text.strip())
992
  cell_replacements = replace_red_text_in_cell(cell, section_replacement)
993
  replacements_made += cell_replacements
994
  if cell_replacements > 0:
995
  print(f" βœ… Fixed single column section: '{key_text}'")
996
-
997
  return replacements_made
998
 
 
 
 
 
999
  def process_tables(document, flat_json):
1000
  """Process all tables in the document with comprehensive fixes"""
1001
  replacements_made = 0
1002
-
1003
  for table_idx, table in enumerate(document.tables):
1004
  print(f"\nπŸ” Processing table {table_idx + 1}:")
1005
-
1006
  # Get table context
1007
  table_text = ""
1008
  for row in table.rows[:3]:
@@ -1013,12 +960,12 @@ def process_tables(document, flat_json):
1013
  management_summary_indicators = ["mass management", "maintenance management", "fatigue management"]
1014
  has_management = any(indicator in table_text for indicator in management_summary_indicators)
1015
  has_details = "details" in table_text
1016
-
1017
  if has_management and has_details:
1018
  print(f" πŸ“‹ Detected Management Summary table")
1019
  summary_fixes = fix_management_summary_details_column(table, flat_json)
1020
  replacements_made += summary_fixes
1021
-
1022
  # Process remaining red text in management summary
1023
  summary_replacements = 0
1024
  for row_idx, row in enumerate(table.rows):
@@ -1031,10 +978,8 @@ def process_tables(document, flat_json):
1031
  if mgmt_type in flat_json:
1032
  mgmt_data = flat_json[mgmt_type]
1033
  if isinstance(mgmt_data, dict):
1034
- # Find matching standard
1035
  for std_key, std_value in mgmt_data.items():
1036
  if isinstance(std_value, list) and len(std_value) > 0:
1037
- # Check if red text matches this standard data
1038
  red_text = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red(run)).strip()
1039
  for item in std_value:
1040
  if len(red_text) > 15 and red_text.lower() in str(item).lower():
@@ -1044,15 +989,14 @@ def process_tables(document, flat_json):
1044
  print(f" βœ… Updated {std_key} with summary data")
1045
  break
1046
  break
1047
-
1048
- # Fallback to existing method
1049
  if summary_replacements == 0:
1050
  cell_replacements = handle_management_summary_fix(cell, flat_json)
1051
  summary_replacements += cell_replacements
1052
-
1053
  replacements_made += summary_replacements
1054
  continue
1055
-
1056
  # Detect Vehicle Registration tables
1057
  vehicle_indicators = ["registration number", "sub-contractor", "weight verification", "rfs suspension"]
1058
  indicator_count = sum(1 for indicator in vehicle_indicators if indicator in table_text)
@@ -1061,57 +1005,61 @@ def process_tables(document, flat_json):
1061
  vehicle_replacements = handle_vehicle_registration_table(table, flat_json)
1062
  replacements_made += vehicle_replacements
1063
  continue
1064
-
1065
  # Detect Attendance List tables
1066
  if "attendance list" in table_text and "names and position titles" in table_text:
1067
  print(f" πŸ‘₯ Detected Attendance List table")
1068
  attendance_replacements = handle_attendance_list_table_enhanced(table, flat_json)
1069
  replacements_made += attendance_replacements
1070
  continue
1071
-
1072
- # Detect Print Accreditation tables
1073
  print_accreditation_indicators = ["print name", "position title"]
1074
  indicator_count = sum(1 for indicator in print_accreditation_indicators if indicator in table_text)
1075
- if indicator_count >= 1:
1076
- print(f" πŸ“‹ Detected Print Accreditation table")
1077
-
1078
- # Check for declaration tables that need fixing
1079
- if "print name" in table_text and "position" in table_text:
1080
- declaration_fixes = fix_operator_declaration_empty_values(table, flat_json)
1081
- replacements_made += declaration_fixes
1082
-
1083
- print_accreditation_replacements = handle_print_accreditation_section(table, flat_json)
1084
- replacements_made += print_accreditation_replacements
 
 
 
 
1085
  continue
1086
-
1087
- # Process regular table rows
1088
  for row_idx, row in enumerate(table.rows):
1089
  if len(row.cells) < 1:
1090
  continue
1091
-
1092
  key_cell = row.cells[0]
1093
  key_text = get_clean_text(key_cell)
1094
-
1095
  if not key_text:
1096
  continue
1097
-
1098
  print(f" πŸ“Œ Row {row_idx + 1}: Key = '{key_text}'")
1099
-
1100
  json_value = find_matching_json_value(key_text, flat_json)
1101
-
1102
  if json_value is not None:
1103
  replacement_text = get_value_as_string(json_value, key_text)
1104
-
1105
  # Handle Australian Company Number
1106
  if ("australian company number" in key_text.lower() or "company number" in key_text.lower()) and isinstance(json_value, list):
1107
  cell_replacements = handle_australian_company_number(row, json_value)
1108
  replacements_made += cell_replacements
1109
-
1110
  # Handle section headers
1111
  elif ("attendance list" in key_text.lower() or "nature of" in key_text.lower()) and row_idx + 1 < len(table.rows):
1112
  print(f" βœ… Section header detected, checking next row...")
1113
  next_row = table.rows[row_idx + 1]
1114
-
1115
  for cell_idx, cell in enumerate(next_row.cells):
1116
  if has_red_text(cell):
1117
  print(f" βœ… Found red text in next row, cell {cell_idx + 1}")
@@ -1121,13 +1069,13 @@ def process_tables(document, flat_json):
1121
  replacements_made += cell_replacements
1122
  if cell_replacements > 0:
1123
  print(f" -> Replaced section content")
1124
-
1125
  # Handle single column sections
1126
  elif len(row.cells) == 1 or (len(row.cells) > 1 and not any(has_red_text(row.cells[i]) for i in range(1, len(row.cells)))):
1127
  if has_red_text(key_cell):
1128
  cell_replacements = process_single_column_sections(key_cell, key_text, flat_json)
1129
  replacements_made += cell_replacements
1130
-
1131
  # Handle regular key-value pairs
1132
  else:
1133
  for cell_idx in range(1, len(row.cells)):
@@ -1136,7 +1084,7 @@ def process_tables(document, flat_json):
1136
  print(f" βœ… Found red text in column {cell_idx + 1}")
1137
  cell_replacements = replace_red_text_in_cell(value_cell, replacement_text)
1138
  replacements_made += cell_replacements
1139
-
1140
  else:
1141
  # Fallback processing for unmatched keys
1142
  if len(row.cells) == 1 and has_red_text(key_cell):
@@ -1151,52 +1099,55 @@ def process_tables(document, flat_json):
1151
  section_replacement = get_value_as_string(section_value, red_text.strip())
1152
  cell_replacements = replace_red_text_in_cell(key_cell, section_replacement)
1153
  replacements_made += cell_replacements
1154
-
1155
  # Process red text in all cells
1156
  for cell_idx in range(len(row.cells)):
1157
  cell = row.cells[cell_idx]
1158
  if has_red_text(cell):
1159
  cell_replacements = handle_multiple_red_segments_in_cell(cell, flat_json)
1160
  replacements_made += cell_replacements
1161
-
1162
  # Apply fixes if no replacements made
1163
  if cell_replacements == 0:
1164
  surgical_fix = handle_nature_business_multiline_fix(cell, flat_json)
1165
  replacements_made += surgical_fix
1166
-
1167
  if cell_replacements == 0:
1168
  management_summary_fix = handle_management_summary_fix(cell, flat_json)
1169
  replacements_made += management_summary_fix
1170
-
1171
  # Handle Operator/Auditor Declaration tables (check last few tables)
1172
  print(f"\n🎯 Final check for Declaration tables...")
1173
  for table in document.tables[-3:]:
1174
  if len(table.rows) <= 4:
 
 
 
1175
  declaration_fix = handle_operator_declaration_fix(table, flat_json)
1176
  replacements_made += declaration_fix
1177
-
1178
  return replacements_made
1179
 
1180
  def process_paragraphs(document, flat_json):
1181
  """Process all paragraphs in the document"""
1182
  replacements_made = 0
1183
  print(f"\nπŸ” Processing paragraphs:")
1184
-
1185
  for para_idx, paragraph in enumerate(document.paragraphs):
1186
  red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
1187
  if red_runs:
1188
  red_text_only = "".join(run.text for run in red_runs).strip()
1189
  print(f" πŸ“Œ Paragraph {para_idx + 1}: Found red text: '{red_text_only}'")
1190
-
1191
  json_value = find_matching_json_value(red_text_only, flat_json)
1192
-
1193
  if json_value is None:
1194
  # Enhanced pattern matching for signatures and dates
1195
  if "AUDITOR SIGNATURE" in red_text_only.upper() or "DATE" in red_text_only.upper():
1196
  json_value = find_matching_json_value("auditor signature", flat_json)
1197
  elif "OPERATOR SIGNATURE" in red_text_only.upper():
1198
  json_value = find_matching_json_value("operator signature", flat_json)
1199
-
1200
  if json_value is not None:
1201
  replacement_text = get_value_as_string(json_value)
1202
  print(f" βœ… Replacing red text with: '{replacement_text}'")
@@ -1205,22 +1156,22 @@ def process_paragraphs(document, flat_json):
1205
  for run in red_runs[1:]:
1206
  run.text = ''
1207
  replacements_made += 1
1208
-
1209
  return replacements_made
1210
 
1211
  def process_headings(document, flat_json):
1212
  """Process headings and their related content"""
1213
  replacements_made = 0
1214
  print(f"\nπŸ” Processing headings:")
1215
-
1216
  paragraphs = document.paragraphs
1217
-
1218
  for para_idx, paragraph in enumerate(paragraphs):
1219
  paragraph_text = paragraph.text.strip()
1220
-
1221
  if not paragraph_text:
1222
  continue
1223
-
1224
  # Check if this is a heading
1225
  matched_heading = None
1226
  for category, patterns in HEADING_PATTERNS.items():
@@ -1230,28 +1181,28 @@ def process_headings(document, flat_json):
1230
  break
1231
  if matched_heading:
1232
  break
1233
-
1234
  if matched_heading:
1235
  print(f" πŸ“Œ Found heading at paragraph {para_idx + 1}: '{paragraph_text}'")
1236
-
1237
  # Check current heading paragraph
1238
  if has_red_text_in_paragraph(paragraph):
1239
  print(f" πŸ”΄ Found red text in heading itself")
1240
  heading_replacements = process_red_text_in_paragraph(paragraph, paragraph_text, flat_json)
1241
  replacements_made += heading_replacements
1242
-
1243
  # Look ahead for related content
1244
  for next_para_offset in range(1, 6):
1245
  next_para_idx = para_idx + next_para_offset
1246
  if next_para_idx >= len(paragraphs):
1247
  break
1248
-
1249
  next_paragraph = paragraphs[next_para_idx]
1250
  next_text = next_paragraph.text.strip()
1251
-
1252
  if not next_text:
1253
  continue
1254
-
1255
  # Stop if we hit another heading
1256
  is_another_heading = False
1257
  for category, patterns in HEADING_PATTERNS.items():
@@ -1261,43 +1212,43 @@ def process_headings(document, flat_json):
1261
  break
1262
  if is_another_heading:
1263
  break
1264
-
1265
  if is_another_heading:
1266
  break
1267
-
1268
  # Process red text with context
1269
  if has_red_text_in_paragraph(next_paragraph):
1270
  print(f" πŸ”΄ Found red text in paragraph {next_para_idx + 1} after heading")
1271
-
1272
  context_replacements = process_red_text_in_paragraph(
1273
- next_paragraph,
1274
  paragraph_text,
1275
  flat_json
1276
  )
1277
  replacements_made += context_replacements
1278
-
1279
  return replacements_made
1280
 
1281
  def process_red_text_in_paragraph(paragraph, context_text, flat_json):
1282
  """Process red text within a paragraph using context"""
1283
  replacements_made = 0
1284
-
1285
  red_text_segments = []
1286
  for run in paragraph.runs:
1287
  if is_red(run) and run.text.strip():
1288
  red_text_segments.append(run.text.strip())
1289
-
1290
  if not red_text_segments:
1291
  return 0
1292
-
1293
  combined_red_text = " ".join(red_text_segments).strip()
1294
  print(f" πŸ” Red text found: '{combined_red_text}'")
1295
-
1296
  json_value = None
1297
-
1298
  # Direct matching
1299
  json_value = find_matching_json_value(combined_red_text, flat_json)
1300
-
1301
  # Context-based matching
1302
  if json_value is None:
1303
  if "NHVAS APPROVED AUDITOR" in context_text.upper():
@@ -1307,7 +1258,7 @@ def process_red_text_in_paragraph(paragraph, context_text, flat_json):
1307
  if json_value is not None:
1308
  print(f" βœ… Found auditor match with field: '{field}'")
1309
  break
1310
-
1311
  elif "OPERATOR DECLARATION" in context_text.upper():
1312
  operator_fields = ["operator name", "operator", "company name", "organisation name", "print name"]
1313
  for field in operator_fields:
@@ -1315,7 +1266,7 @@ def process_red_text_in_paragraph(paragraph, context_text, flat_json):
1315
  if json_value is not None:
1316
  print(f" βœ… Found operator match with field: '{field}'")
1317
  break
1318
-
1319
  # Combined context queries
1320
  if json_value is None:
1321
  context_queries = [
@@ -1323,98 +1274,107 @@ def process_red_text_in_paragraph(paragraph, context_text, flat_json):
1323
  combined_red_text,
1324
  context_text
1325
  ]
1326
-
1327
  for query in context_queries:
1328
  json_value = find_matching_json_value(query, flat_json)
1329
  if json_value is not None:
1330
  print(f" βœ… Found match with combined query")
1331
  break
1332
-
1333
  # Replace if match found
1334
  if json_value is not None:
1335
  replacement_text = get_value_as_string(json_value, combined_red_text)
1336
-
1337
  red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
1338
  if red_runs:
1339
  red_runs[0].text = replacement_text
1340
  red_runs[0].font.color.rgb = RGBColor(0, 0, 0)
1341
-
1342
  for run in red_runs[1:]:
1343
  run.text = ''
1344
-
1345
  replacements_made = 1
1346
  print(f" βœ… Replaced with: '{replacement_text}'")
1347
  else:
1348
  print(f" ❌ No match found for red text: '{combined_red_text}'")
1349
-
1350
  return replacements_made
1351
 
1352
  def force_red_text_replacement(document, flat_json):
1353
  """Force replacement of any remaining red text by trying ALL JSON values"""
1354
  replacements_made = 0
1355
  print(f"\n🎯 FORCE FIX: Scanning for any remaining red text...")
1356
-
1357
  # Collect all possible replacement values from JSON
1358
  all_values = {}
1359
  for key, value in flat_json.items():
1360
  if value:
1361
  value_str = get_value_as_string(value, key)
1362
-
1363
  if value_str and isinstance(value_str, str) and value_str.strip():
1364
  all_values[key] = value_str.strip()
1365
-
1366
  # Store individual items from lists for partial matching
1367
  if isinstance(value, list):
1368
  for i, item in enumerate(value):
1369
  item_str = str(item).strip() if item else ""
1370
  if item_str:
1371
  all_values[f"{key}_item_{i}"] = item_str
1372
-
1373
  print(f" Found {len(all_values)} potential replacement values")
1374
-
1375
  # Process all tables
1376
  for table_idx, table in enumerate(document.tables):
1377
  for row_idx, row in enumerate(table.rows):
1378
  for cell_idx, cell in enumerate(row.cells):
1379
  if has_red_text(cell):
1380
  print(f" πŸ” Found red text in Table {table_idx + 1}, Row {row_idx + 1}, Cell {cell_idx + 1}")
1381
-
1382
  # Extract all red text from this cell
1383
  red_text_parts = []
1384
  for paragraph in cell.paragraphs:
1385
  for run in paragraph.runs:
1386
  if is_red(run) and run.text.strip():
1387
  red_text_parts.append(run.text.strip())
1388
-
1389
  combined_red_text = " ".join(red_text_parts).strip()
1390
  print(f" Red text: '{combined_red_text}'")
1391
-
 
 
 
1392
  # Find best match
1393
  best_match = None
1394
  best_key = None
1395
-
1396
- # Exact matching
1397
  for key, value in all_values.items():
1398
  if combined_red_text.lower() == value.lower():
1399
  best_match = value
1400
  best_key = key
1401
  break
1402
-
1403
- # Partial matching
1404
  if not best_match:
1405
  for key, value in all_values.items():
 
 
 
1406
  if (len(value) > 3 and value.lower() in combined_red_text.lower()) or \
1407
  (len(combined_red_text) > 3 and combined_red_text.lower() in value.lower()):
1408
  best_match = value
1409
  best_key = key
1410
  break
1411
-
1412
  # Word-by-word matching for names/dates
1413
  if not best_match:
1414
  red_words = set(word.lower() for word in combined_red_text.split() if len(word) > 2)
1415
  best_score = 0
1416
-
1417
  for key, value in all_values.items():
 
 
 
1418
  value_words = set(word.lower() for word in str(value).split() if len(word) > 2)
1419
  if red_words and value_words:
1420
  common_words = red_words.intersection(value_words)
@@ -1424,7 +1384,7 @@ def force_red_text_replacement(document, flat_json):
1424
  best_score = score
1425
  best_match = value
1426
  best_key = key
1427
-
1428
  # Replace if we found a match
1429
  if best_match:
1430
  print(f" βœ… Replacing with: '{best_match}' (from key: '{best_key}')")
@@ -1433,7 +1393,7 @@ def force_red_text_replacement(document, flat_json):
1433
  print(f" Made {cell_replacements} replacements")
1434
  else:
1435
  print(f" ❌ No suitable replacement found")
1436
-
1437
  # Process all paragraphs
1438
  for para_idx, paragraph in enumerate(document.paragraphs):
1439
  if has_red_text_in_paragraph(paragraph):
@@ -1441,37 +1401,43 @@ def force_red_text_replacement(document, flat_json):
1441
  for run in paragraph.runs:
1442
  if is_red(run) and run.text.strip():
1443
  red_text_parts.append(run.text.strip())
1444
-
1445
  combined_red_text = " ".join(red_text_parts).strip()
1446
  if combined_red_text:
1447
  print(f" πŸ” Found red text in Paragraph {para_idx + 1}: '{combined_red_text}'")
1448
-
1449
  # Same matching logic as above
1450
  best_match = None
1451
  best_key = None
1452
-
 
 
1453
  # Exact match
1454
  for key, value in all_values.items():
1455
  if combined_red_text.lower() == value.lower():
1456
  best_match = value
1457
  best_key = key
1458
  break
1459
-
1460
  # Partial match
1461
  if not best_match:
1462
  for key, value in all_values.items():
 
 
1463
  if (len(value) > 3 and value.lower() in combined_red_text.lower()) or \
1464
  (len(combined_red_text) > 3 and combined_red_text.lower() in value.lower()):
1465
  best_match = value
1466
  best_key = key
1467
  break
1468
-
1469
  # Word match
1470
  if not best_match:
1471
  red_words = set(word.lower() for word in combined_red_text.split() if len(word) > 2)
1472
  best_score = 0
1473
-
1474
  for key, value in all_values.items():
 
 
1475
  value_words = set(word.lower() for word in str(value).split() if len(word) > 2)
1476
  if red_words and value_words:
1477
  common_words = red_words.intersection(value_words)
@@ -1481,7 +1447,7 @@ def force_red_text_replacement(document, flat_json):
1481
  best_score = score
1482
  best_match = value
1483
  best_key = key
1484
-
1485
  # Replace if found
1486
  if best_match:
1487
  print(f" βœ… Replacing with: '{best_match}' (from key: '{best_key}')")
@@ -1495,7 +1461,7 @@ def force_red_text_replacement(document, flat_json):
1495
  print(f" Made 1 paragraph replacement")
1496
  else:
1497
  print(f" ❌ No suitable replacement found")
1498
-
1499
  return replacements_made
1500
 
1501
  def process_hf(json_file, docx_file, output_file):
@@ -1507,7 +1473,7 @@ def process_hf(json_file, docx_file, output_file):
1507
  else:
1508
  with open(json_file, 'r', encoding='utf-8') as f:
1509
  json_data = json.load(f)
1510
-
1511
  flat_json = flatten_json(json_data)
1512
  print("πŸ“„ Available JSON keys (sample):")
1513
  for i, (key, value) in enumerate(sorted(flat_json.items())):
@@ -1523,14 +1489,14 @@ def process_hf(json_file, docx_file, output_file):
1523
 
1524
  # Process document with all fixes
1525
  print("πŸš€ Starting comprehensive document processing...")
1526
-
1527
  table_replacements = process_tables(doc, flat_json)
1528
  paragraph_replacements = process_paragraphs(doc, flat_json)
1529
  heading_replacements = process_headings(doc, flat_json)
1530
-
1531
  # Final force fix for any remaining red text
1532
  force_replacements = force_red_text_replacement(doc, flat_json)
1533
-
1534
  total_replacements = table_replacements + paragraph_replacements + heading_replacements + force_replacements
1535
 
1536
  # Save output
@@ -1538,7 +1504,7 @@ def process_hf(json_file, docx_file, output_file):
1538
  doc.save(output_file)
1539
  else:
1540
  doc.save(output_file)
1541
-
1542
  print(f"\nβœ… Document saved as: {output_file}")
1543
  print(f"βœ… Total replacements: {total_replacements}")
1544
  print(f" πŸ“Š Tables: {table_replacements}")
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Updated updated_word.py
4
+ Merged improvements:
5
+ - removed duplicate functions
6
+ - table processed-marker to avoid multiple handlers clobbering the same table
7
+ - stricter detection of print-accreditation/operator-declaration tables
8
+ - safer force replacement (avoid short->long mapping)
9
+ - prefer exact qualified keys for Print Name / Position Title lookups
10
+ - preserved all other logic and prints/logging
11
+ """
12
+
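A minimal sketch of the processed-marker idea mentioned in the docstring above (the attribute and helper names are illustrative assumptions, not necessarily the ones used later in this file):

# Tag a python-docx table once a handler has filled it in, so later handlers skip it.
def mark_table_processed(table, handler_name):
    table._processed_by = handler_name  # plain attribute set on the Table object

def is_table_processed(table):
    return getattr(table, "_processed_by", None) is not None

# Sketch of the usage inside process_tables(): each specialised handler would call
# is_table_processed(table) first and mark_table_processed(table, "vehicle") after
# making replacements, so the vehicle, attendance and declaration handlers never
# overwrite each other's work on the same table.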
13
  import json
14
  from docx import Document
15
  from docx.shared import RGBColor
16
  import re
17
+ from typing import Any
18
 
19
  # Heading patterns for document structure detection
20
  HEADING_PATTERNS = {
 
45
  # ============================================================================
46
 
47
  def load_json(filepath):
48
+ with open(filepath, 'r', encoding='utf-8') as file:
49
  return json.load(file)
50
 
51
  def flatten_json(y, prefix=''):
 
61
 
62
  def is_red(run):
63
  color = run.font.color
64
+ # safe checks, handle theme_color fallback as before
65
+ try:
66
+ return color and (getattr(color, "rgb", None) and color.rgb == RGBColor(255, 0, 0) or getattr(color, "theme_color", None) == 1)
67
+ except Exception:
68
+ # best-effort: If object doesn't match expected shape, return False
69
+ return False
70
 
71
  def get_value_as_string(value, field_name=""):
72
  if isinstance(value, list):
 
108
 
109
  def find_matching_json_value(field_name, flat_json):
110
  """Find matching value in JSON with multiple strategies"""
111
+ field_name = (field_name or "").strip()
112
+ if not field_name:
113
+ return None
114
+
115
  # Try exact match first
116
  if field_name in flat_json:
117
  print(f" βœ… Direct match found for key '{field_name}'")
118
  return flat_json[field_name]
119
+
120
  # Try case-insensitive exact match
121
  for key, value in flat_json.items():
122
  if key.lower() == field_name.lower():
123
  print(f" βœ… Case-insensitive match found for key '{field_name}' with JSON key '{key}'")
124
  return value
125
+
126
+ # Better Print Name detection for operator vs auditor (prefer fully-qualified keys)
127
  if field_name.lower().strip() == "print name":
128
  operator_keys = [k for k in flat_json.keys() if "operator" in k.lower() and "print name" in k.lower()]
129
  auditor_keys = [k for k in flat_json.keys() if "auditor" in k.lower() and ("print name" in k.lower() or "name" in k.lower())]
130
+
131
  if operator_keys:
132
  print(f" βœ… Operator Print Name match: '{field_name}' -> '{operator_keys[0]}'")
133
  return flat_json[operator_keys[0]]
134
  elif auditor_keys:
135
  print(f" βœ… Auditor Name match: '{field_name}' -> '{auditor_keys[0]}'")
136
  return flat_json[auditor_keys[0]]
137
+
138
  # Try suffix matching (for nested keys like "section.field")
139
  for key, value in flat_json.items():
140
  if '.' in key and key.split('.')[-1].lower() == field_name.lower():
141
  print(f" βœ… Suffix match found for key '{field_name}' with JSON key '{key}'")
142
  return value
143
+
144
+ # Clean and exact match attempt
145
  clean_field = re.sub(r'[^\w\s]', ' ', field_name.lower()).strip()
146
  clean_field = re.sub(r'\s+', ' ', clean_field)
 
147
  for key, value in flat_json.items():
148
  clean_key = re.sub(r'[^\w\s]', ' ', key.lower()).strip()
149
  clean_key = re.sub(r'\s+', ' ', clean_key)
 
150
  if clean_field == clean_key:
151
  print(f" βœ… Clean match found for key '{field_name}' with JSON key '{key}'")
152
  return value
153
+
154
  # Enhanced fuzzy matching with better scoring
155
  field_words = set(word.lower() for word in re.findall(r'\b\w+\b', field_name) if len(word) > 2)
156
  if not field_words:
157
  return None
158
+
159
  best_match = None
160
  best_score = 0
161
  best_key = None
162
+
163
  for key, value in flat_json.items():
164
  key_words = set(word.lower() for word in re.findall(r'\b\w+\b', key) if len(word) > 2)
165
  if not key_words:
166
  continue
167
+
168
+ # Calculate similarity score: Jaccard + coverage
169
  common_words = field_words.intersection(key_words)
170
  if common_words:
 
171
  similarity = len(common_words) / len(field_words.union(key_words))
 
 
172
  coverage = len(common_words) / len(field_words)
173
  final_score = (similarity * 0.6) + (coverage * 0.4)
174
+
175
  if final_score > best_score:
176
  best_score = final_score
177
  best_match = value
178
  best_key = key
179
+
180
  if best_match and best_score >= 0.25:
181
  print(f" βœ… Fuzzy match found for key '{field_name}' with JSON key '{best_key}' (score: {best_score:.2f})")
182
  return best_match
183
+
184
  print(f" ❌ No match found for '{field_name}'")
185
  return None
186
 
 
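# Worked example of the fuzzy scoring above (field/key pair chosen for illustration):
#   field "Operator Print Name"              -> words {operator, print, name}
#   key   "Operator Declaration.Print Name"  -> words {operator, declaration, print, name}
#   similarity = 3/4 = 0.75, coverage = 3/3 = 1.0,
#   final_score = 0.75 * 0.6 + 1.0 * 0.4 = 0.85, comfortably above the 0.25 threshold.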
191
  def extract_red_text_segments(cell):
192
  """Extract red text segments from a cell"""
193
  red_segments = []
194
+
195
  for para_idx, paragraph in enumerate(cell.paragraphs):
196
  current_segment = ""
197
  segment_runs = []
198
+
199
  for run_idx, run in enumerate(paragraph.runs):
200
  if is_red(run):
201
  if run.text:
 
211
  })
212
  current_segment = ""
213
  segment_runs = []
214
+
215
  # Handle segment at end of paragraph
216
  if segment_runs:
217
  red_segments.append({
 
219
  'runs': segment_runs.copy(),
220
  'paragraph_idx': para_idx
221
  })
222
+
223
  return red_segments
224
 
225
  def replace_all_red_segments(red_segments, replacement_text):
226
  """Replace all red segments with replacement text"""
227
  if not red_segments:
228
  return 0
229
+
230
  if '\n' in replacement_text:
231
  replacement_lines = replacement_text.split('\n')
232
  else:
233
  replacement_lines = [replacement_text]
234
+
235
  replacements_made = 0
236
+
237
  if red_segments and replacement_lines:
238
  first_segment = red_segments[0]
239
  if first_segment['runs']:
 
241
  first_run.text = replacement_lines[0]
242
  first_run.font.color.rgb = RGBColor(0, 0, 0)
243
  replacements_made = 1
244
+
245
  for _, _, run in first_segment['runs'][1:]:
246
  run.text = ''
247
+
248
  for segment in red_segments[1:]:
249
  for _, _, run in segment['runs']:
250
  run.text = ''
251
+
252
  if len(replacement_lines) > 1 and red_segments:
253
  try:
254
  first_run = red_segments[0]['runs'][0][2]
255
  # append the remaining lines to this run, separated by manual line breaks
256
+ # (best-effort: Run.add_break() / Run.add_text() keep the new text in the first run,
257
+ # which was already recolored black above, so no separate run or color fix is needed)
258
+
259
  for line in replacement_lines[1:]:
260
  if line.strip():
 
261
  first_run.add_break()
262
  first_run.add_text(line.strip())
263
+
264
 
265
 
266
+ except Exception:
267
  if red_segments and red_segments[0]['runs']:
268
  first_run = red_segments[0]['runs'][0][2]
269
  first_run.text = ' '.join(replacement_lines)
270
  first_run.font.color.rgb = RGBColor(0, 0, 0)
271
+
272
  return replacements_made
273
 
274
  def replace_single_segment(segment, replacement_text):
275
  """Replace a single red text segment"""
276
  if not segment['runs']:
277
  return False
278
+
279
  first_run = segment['runs'][0][2]
280
  first_run.text = replacement_text
281
  first_run.font.color.rgb = RGBColor(0, 0, 0)
282
+
283
  for _, _, run in segment['runs'][1:]:
284
  run.text = ''
285
+
286
  return True
287
 
288
  def replace_red_text_in_cell(cell, replacement_text):
289
  """Replace red text in a cell with replacement text"""
290
  red_segments = extract_red_text_segments(cell)
291
+
292
  if not red_segments:
293
  return 0
294
+
295
  return replace_all_red_segments(red_segments, replacement_text)
296
 
297
  # ============================================================================
 
314
  def handle_vehicle_registration_table(table, flat_json):
315
  """Handle vehicle registration table data replacement"""
316
  replacements_made = 0
317
+
318
  # Try to find vehicle registration data
319
  vehicle_section = None
320
+
321
  for key, value in flat_json.items():
322
  if "vehicle registration numbers of records examined" in key.lower():
323
  if isinstance(value, dict):
324
  vehicle_section = value
325
  print(f" βœ… Found vehicle data in key: '{key}'")
326
  break
327
+
328
  if not vehicle_section:
329
  potential_columns = {}
330
  for key, value in flat_json.items():
331
+ if any(col_name in key.lower() for col_name in ["registration number", "sub-contractor", "weight verification", "rfs suspension", "trip records", "suspension"]):
332
  if "." in key:
333
  column_name = key.split(".")[-1]
334
  else:
335
  column_name = key
336
  potential_columns[column_name] = value
337
+
338
  if potential_columns:
339
  vehicle_section = potential_columns
340
  print(f" βœ… Found vehicle data from flattened keys: {list(vehicle_section.keys())}")
341
  else:
342
  print(f" ❌ Vehicle registration data not found in JSON")
343
  return 0
344
+
345
  print(f" βœ… Found vehicle registration data with {len(vehicle_section)} columns")
346
+
347
  # Find header row
348
  header_row_idx = -1
349
  header_row = None
350
+
351
  for row_idx, row in enumerate(table.rows):
352
  row_text = "".join(get_clean_text(cell).lower() for cell in row.cells)
353
  if "registration" in row_text and "number" in row_text:
354
  header_row_idx = row_idx
355
  header_row = row
356
  break
357
+
358
  if header_row_idx == -1:
359
  print(f" ❌ Could not find header row in vehicle table")
360
  return 0
361
+
362
  print(f" βœ… Found header row at index {header_row_idx}")
363
+
364
+ # Enhanced column mapping (same method as before)
365
  column_mapping = {}
366
  for col_idx, cell in enumerate(header_row.cells):
367
  header_text = get_clean_text(cell).strip()
368
  if not header_text or header_text.lower() == "no.":
369
  continue
370
+
371
  best_match = None
372
  best_score = 0
373
+
374
  normalized_header = header_text.lower().replace("(", " (").replace(")", ") ").strip()
375
+
376
  for json_key in vehicle_section.keys():
377
  normalized_json = json_key.lower().strip()
378
+
379
  if normalized_header == normalized_json:
380
  best_match = json_key
381
  best_score = 1.0
382
  break
383
+
384
  header_words = set(word.lower() for word in normalized_header.split() if len(word) > 2)
385
  json_words = set(word.lower() for word in normalized_json.split() if len(word) > 2)
386
+
387
  if header_words and json_words:
388
  common_words = header_words.intersection(json_words)
389
  score = len(common_words) / max(len(header_words), len(json_words))
390
+
391
  if score > best_score and score >= 0.3:
392
  best_score = score
393
  best_match = json_key
394
+
395
  header_clean = normalized_header.replace(" ", "").replace("-", "").replace("(", "").replace(")", "")
396
  json_clean = normalized_json.replace(" ", "").replace("-", "").replace("(", "").replace(")", "")
397
+
398
  if header_clean in json_clean or json_clean in header_clean:
399
  if len(header_clean) > 5 and len(json_clean) > 5:
400
  substring_score = min(len(header_clean), len(json_clean)) / max(len(header_clean), len(json_clean))
401
  if substring_score > best_score and substring_score >= 0.6:
402
  best_score = substring_score
403
  best_match = json_key
404
+
405
  if best_match:
406
  column_mapping[col_idx] = best_match
407
  print(f" πŸ“Œ Column {col_idx + 1} ('{header_text}') -> '{best_match}' (score: {best_score:.2f})")
408
+
409
  if not column_mapping:
410
  print(f" ❌ No column mappings found")
411
  return 0
412
+
413
  # Determine data rows needed
414
  max_data_rows = 0
415
  for json_key, data in vehicle_section.items():
416
  if isinstance(data, list):
417
  max_data_rows = max(max_data_rows, len(data))
418
+
419
  print(f" πŸ“Œ Need to populate {max_data_rows} data rows")
420
+
421
  # Process data rows
422
  for data_row_index in range(max_data_rows):
423
  table_row_idx = header_row_idx + 1 + data_row_index
424
+
425
  if table_row_idx >= len(table.rows):
426
  print(f" ⚠️ Row {table_row_idx + 1} doesn't exist - table only has {len(table.rows)} rows")
427
  print(f" βž• Adding new row for vehicle {data_row_index + 1}")
428
+
429
  new_row = table.add_row()
430
  print(f" βœ… Successfully added row {len(table.rows)} to the table")
431
+
432
  row = table.rows[table_row_idx]
433
  print(f" πŸ“Œ Processing data row {table_row_idx + 1} (vehicle {data_row_index + 1})")
434
+
435
  for col_idx, json_key in column_mapping.items():
436
  if col_idx < len(row.cells):
437
  cell = row.cells[col_idx]
438
+
439
  column_data = vehicle_section.get(json_key, [])
440
  if isinstance(column_data, list) and data_row_index < len(column_data):
441
  replacement_value = str(column_data[data_row_index])
442
+
443
  cell_text = get_clean_text(cell)
444
  if has_red_text(cell) or not cell_text.strip():
445
  if not cell_text.strip():
 
451
  replacements_made += cell_replacements
452
  if cell_replacements > 0:
453
  print(f" -> Replaced red text with '{replacement_value}' (column '{json_key}')")
454
+
455
  return replacements_made
456
 
457
  def handle_attendance_list_table_enhanced(table, flat_json):
458
  """Enhanced Attendance List processing with better detection"""
459
  replacements_made = 0
460
+
461
  # Check multiple patterns for attendance list
462
  attendance_patterns = [
463
  "attendance list",
464
  "names and position titles",
465
  "attendees"
466
  ]
467
+
468
  # Scan all cells in the first few rows for attendance list indicators
469
  found_attendance_row = None
470
+
471
  for row_idx, row in enumerate(table.rows[:3]): # Check first 3 rows
472
  for cell_idx, cell in enumerate(row.cells):
473
  cell_text = get_clean_text(cell).lower()
474
+
475
  # Check if this cell contains attendance list header
476
  if any(pattern in cell_text for pattern in attendance_patterns):
477
  found_attendance_row = row_idx
478
  print(f" 🎯 ENHANCED: Found Attendance List in row {row_idx + 1}, cell {cell_idx + 1}")
479
  break
480
+
481
  if found_attendance_row is not None:
482
  break
483
+
484
  if found_attendance_row is None:
485
  return 0
486
+
487
  # Look for attendance data in JSON
488
  attendance_value = None
489
  attendance_search_keys = [
 
492
  "attendance list",
493
  "attendees"
494
  ]
495
+
496
  print(f" πŸ” Searching for attendance data in JSON...")
497
+
498
  for search_key in attendance_search_keys:
499
  attendance_value = find_matching_json_value(search_key, flat_json)
500
  if attendance_value is not None:
501
  print(f" βœ… Found attendance data with key: '{search_key}'")
502
  print(f" πŸ“Š Raw value: {attendance_value}")
503
  break
504
+
505
  if attendance_value is None:
506
  print(f" ❌ No attendance data found in JSON")
507
  return 0
508
+
509
  # Look for red text in ALL cells of the table
510
  target_cell = None
511
+
512
  print(f" πŸ” Scanning ALL cells in attendance table for red text...")
513
+
514
  for row_idx, row in enumerate(table.rows):
515
  for cell_idx, cell in enumerate(row.cells):
516
  if has_red_text(cell):
517
  print(f" 🎯 Found red text in row {row_idx + 1}, cell {cell_idx + 1}")
518
+
519
  # Get the red text to see if it looks like attendance data
520
  red_text = ""
521
  for paragraph in cell.paragraphs:
522
  for run in paragraph.runs:
523
  if is_red(run):
524
  red_text += run.text
525
+
526
  print(f" πŸ“‹ Red text content: '{red_text[:50]}...'")
527
+
528
  # Check if this red text looks like attendance data (contains names/manager/etc)
529
  red_text_lower = red_text.lower()
530
  if any(indicator in red_text_lower for indicator in ['manager', 'herbig', 'palin', '–', '-']):
531
  target_cell = cell
532
  print(f" βœ… This looks like attendance data - using this cell")
533
  break
534
+
535
  if target_cell is not None:
536
  break
537
+
538
  # If no red text found that looks like attendance data, return
539
  if target_cell is None:
540
  print(f" ⚠️ No red text found that looks like attendance data")
541
  return 0
542
+
543
  # Replace red text with properly formatted attendance list
544
  if has_red_text(target_cell):
545
  print(f" πŸ”§ Replacing red text with properly formatted attendance list...")
546
+
547
  # Ensure attendance_value is a list
548
  if isinstance(attendance_value, list):
549
  attendance_list = [str(item).strip() for item in attendance_value if str(item).strip()]
550
  else:
551
  attendance_list = [str(attendance_value).strip()]
552
+
553
  print(f" πŸ“ Attendance items to add:")
554
  for i, item in enumerate(attendance_list):
555
  print(f" {i+1}. {item}")
556
+
557
  # Replace with line-separated attendance list
558
  replacement_text = "\n".join(attendance_list)
559
  cell_replacements = replace_red_text_in_cell(target_cell, replacement_text)
560
  replacements_made += cell_replacements
561
+
562
  print(f" βœ… Added {len(attendance_list)} attendance items")
563
  print(f" πŸ“Š Replacements made: {cell_replacements}")
564
+
565
  return replacements_made
566
 
567
  def fix_management_summary_details_column(table, flat_json):
568
  """Fix the DETAILS column in Management Summary table"""
569
  replacements_made = 0
570
+
571
  print(f" 🎯 FIX: Management Summary DETAILS column processing")
572
+
573
  # Check if this is a Management Summary table
574
  table_text = ""
575
  for row in table.rows[:2]:
576
  for cell in row.cells:
577
  table_text += get_clean_text(cell).lower() + " "
578
+
579
  if not ("mass management" in table_text and "details" in table_text):
580
  return 0
581
+
582
  print(f" βœ… Confirmed Mass Management Summary table")
583
+
584
  # Process each row looking for Std 5. and Std 6. with red text
585
  for row_idx, row in enumerate(table.rows):
586
  if len(row.cells) >= 2:
587
  standard_cell = row.cells[0]
588
  details_cell = row.cells[1]
589
+
590
  standard_text = get_clean_text(standard_cell).strip()
591
+
592
  # Look for Std 5. Verification and Std 6. Internal Review specifically
593
  if "Std 5." in standard_text and "Verification" in standard_text:
594
  if has_red_text(details_cell):
595
  print(f" πŸ” Found Std 5. Verification with red text")
596
+
597
  json_value = find_matching_json_value("Std 5. Verification", flat_json)
598
  if json_value is not None:
599
  replacement_text = get_value_as_string(json_value, "Std 5. Verification")
600
  cell_replacements = replace_red_text_in_cell(details_cell, replacement_text)
601
  replacements_made += cell_replacements
602
  print(f" βœ… Replaced Std 5. Verification details")
603
+
604
  elif "Std 6." in standard_text and "Internal Review" in standard_text:
605
  if has_red_text(details_cell):
606
  print(f" πŸ” Found Std 6. Internal Review with red text")
607
+
608
  json_value = find_matching_json_value("Std 6. Internal Review", flat_json)
609
  if json_value is not None:
610
  replacement_text = get_value_as_string(json_value, "Std 6. Internal Review")
611
  cell_replacements = replace_red_text_in_cell(details_cell, replacement_text)
612
  replacements_made += cell_replacements
613
  print(f" βœ… Replaced Std 6. Internal Review details")
614
+
615
  return replacements_made
616
 
617
+ # ========================================================================
618
+ # IMPORTANT: Single canonical definition for Operator Declaration fixer
619
+ # ========================================================================
620
+
621
  def fix_operator_declaration_empty_values(table, flat_json):
622
  """Fix Operator Declaration table when values are empty or need updating"""
623
  replacements_made = 0
624
+
625
  print(f" 🎯 FIX: Operator Declaration empty values processing")
626
+
627
  # Check if this is an Operator Declaration table
628
  table_context = ""
629
  for row in table.rows:
630
  for cell in row.cells:
631
  table_context += get_clean_text(cell).lower() + " "
632
+
633
  if not ("print name" in table_context and "position title" in table_context):
634
  return 0
635
+
636
  print(f" βœ… Confirmed Operator Declaration table")
637
+
638
  # Find the data row with Print Name and Position Title
639
  for row_idx, row in enumerate(table.rows):
640
  if len(row.cells) >= 2:
641
  cell1_text = get_clean_text(row.cells[0]).strip().lower()
642
  cell2_text = get_clean_text(row.cells[1]).strip().lower()
643
+
644
  # Check if this is the header row
645
  if "print name" in cell1_text and "position" in cell2_text:
646
  print(f" πŸ“Œ Found header row at {row_idx + 1}")
647
+
648
  # Look for the data row (next row)
649
  if row_idx + 1 < len(table.rows):
650
  data_row = table.rows[row_idx + 1]
651
  if len(data_row.cells) >= 2:
652
  name_cell = data_row.cells[0]
653
  position_cell = data_row.cells[1]
654
+
655
  # Check if cells are empty or have red text
656
  name_text = get_clean_text(name_cell).strip()
657
  position_text = get_clean_text(position_cell).strip()
658
+
659
  print(f" πŸ“‹ Current values: Name='{name_text}', Position='{position_text}'")
660
+
661
+ # FORCE UPDATE - prefer fully qualified keys first (exact)
662
+ print(f" πŸ”§ FORCE updating Print Name (exact-key first)")
663
  name_value = find_matching_json_value("Operator Declaration.Print Name", flat_json)
664
+ if name_value is None:
665
+ # fallback to common alternatives
666
+ name_value = find_matching_json_value("Print Name", flat_json)
667
  if name_value:
668
  new_name = get_value_as_string(name_value).strip()
669
  if new_name and "Pty Ltd" not in new_name and "Company" not in new_name and "Farming" not in new_name:
670
+ # attempt targeted replacement: if red exists, replace red, else set text
671
+ if has_red_text(name_cell):
672
+ replace_red_text_in_cell(name_cell, new_name)
673
+ else:
674
+ name_cell.text = new_name
675
  replacements_made += 1
676
  print(f" βœ… FORCE Updated Print Name: '{name_text}' -> '{new_name}'")
677
+
678
+ print(f" πŸ”§ FORCE updating Position Title (exact-key first)")
679
  position_value = find_matching_json_value("Operator Declaration.Position Title", flat_json)
680
+ if position_value is None:
681
+ position_value = find_matching_json_value("Position Title", flat_json)
682
  if position_value:
683
  new_position = get_value_as_string(position_value).strip()
684
  if new_position:
685
+ if has_red_text(position_cell):
686
+ replace_red_text_in_cell(position_cell, new_position)
687
+ else:
688
+ position_cell.text = new_position
689
  replacements_made += 1
690
  print(f" βœ… FORCE Updated Position Title: '{position_text}' -> '{new_position}'")
691
+
692
+ # If still no updates, try alternative sources (already covered via fallback above)
693
+
 
 
 
 
694
  break
695
+
696
+ # <<< PATCH: mark table processed so other handlers skip it
697
+ if replacements_made > 0:
698
+ try:
699
+ setattr(table, "_processed_operator_declaration", True)
700
+ print(" πŸ”– Marked table as processed by Operator Declaration handler")
701
+ except Exception:
702
+ pass
703
+ # <<< END PATCH
704
+
705
  return replacements_made
706
 
707
  def handle_multiple_red_segments_in_cell(cell, flat_json):
708
  """Handle multiple red text segments within a single cell"""
709
  replacements_made = 0
710
+
711
  red_segments = extract_red_text_segments(cell)
712
  if not red_segments:
713
  return 0
714
+
715
  # Try to match each segment individually
716
  for i, segment in enumerate(red_segments):
717
  segment_text = segment['text'].strip()
 
722
  if replace_single_segment(segment, replacement_text):
723
  replacements_made += 1
724
  print(f" βœ… Replaced segment {i+1}: '{segment_text}' -> '{replacement_text}'")
725
+
726
  return replacements_made
727
 
728
  def handle_nature_business_multiline_fix(cell, flat_json):
729
  """Handle Nature of Business multiline red text"""
730
  replacements_made = 0
731
+
732
  # Extract red text to check if it looks like nature of business
733
  red_text = ""
734
  for paragraph in cell.paragraphs:
735
  for run in paragraph.runs:
736
  if is_red(run):
737
  red_text += run.text
738
+
739
  red_text = red_text.strip()
740
  if not red_text:
741
  return 0
742
+
743
  # Check if this looks like nature of business content
744
  nature_indicators = ["transport", "logistics", "freight", "delivery", "trucking", "haulage"]
745
  if any(indicator in red_text.lower() for indicator in nature_indicators):
 
750
  cell_replacements = replace_red_text_in_cell(cell, replacement_text)
751
  replacements_made += cell_replacements
752
  print(f" βœ… Fixed Nature of Business multiline content")
753
+
754
  return replacements_made
755
 
756
  def handle_management_summary_fix(cell, flat_json):
757
  """Handle Management Summary content fixes"""
758
  replacements_made = 0
759
+
760
  # Extract red text
761
  red_text = ""
762
  for paragraph in cell.paragraphs:
763
  for run in paragraph.runs:
764
  if is_red(run):
765
  red_text += run.text
766
+
767
  red_text = red_text.strip()
768
  if not red_text:
769
  return 0
770
+
771
  # Look for management summary data in new schema format
772
  management_types = ["Mass Management Summary", "Maintenance Management Summary", "Fatigue Management Summary"]
773
+
774
  for mgmt_type in management_types:
775
  if mgmt_type in flat_json:
776
  mgmt_data = flat_json[mgmt_type]
 
787
  replacements_made += cell_replacements
788
  print(f" βœ… Fixed {mgmt_type} - {std_key}")
789
  return replacements_made
 
 
790
 
 
 
 
 
 
 
 
791
  return replacements_made
792
 
793
+ # ========================================================================
794
+ # SMALL OPERATOR/AUDITOR TABLE HANDLER (skip if already processed)
795
+ # ========================================================================
796
+
797
  def handle_operator_declaration_fix(table, flat_json):
798
  """Handle small Operator/Auditor Declaration tables - SKIP if already processed"""
799
  replacements_made = 0
800
+
801
+ # <<< PATCH: skip if marked processed
802
+ if getattr(table, "_processed_operator_declaration", False):
803
+ print(f" ⏭️ Skipping - Operator Declaration table already processed")
804
+ return 0
805
+ # <<< END PATCH
806
+
807
  if len(table.rows) > 4: # Only process small tables
808
  return 0
809
+
810
  # Get table context
811
  table_text = ""
812
  for row in table.rows:
813
  for cell in row.cells:
814
  table_text += get_clean_text(cell).lower() + " "
815
+
816
  # SKIP if this is an Operator Declaration table (already handled by fix_operator_declaration_empty_values)
817
  if "print name" in table_text and "position title" in table_text:
818
  print(f" ⏭️ Skipping - Operator Declaration table already processed")
819
  return 0
820
+
821
  # Check if this is a declaration table
822
  if not ("print name" in table_text or "signature" in table_text or "date" in table_text):
823
  return 0
824
+
825
  print(f" 🎯 Processing other declaration table")
826
+
827
  # Process each cell with red text (for auditor declarations, etc.)
828
  for row_idx, row in enumerate(table.rows):
829
  for cell_idx, cell in enumerate(row.cells):
 
832
  declaration_fields = [
833
  "NHVAS Approved Auditor Declaration.Print Name",
834
  "Auditor name",
835
+ "Signature",
836
  "Date"
837
  ]
838
+
839
  replaced = False
840
  for field in declaration_fields:
841
  field_value = find_matching_json_value(field, flat_json)
 
848
  print(f" βœ… Fixed declaration field: {field}")
849
  replaced = True
850
  break
851
+
852
  # If no specific field match, try generic signature/date
853
  if not replaced:
854
  red_text = ""
 
856
  for run in paragraph.runs:
857
  if is_red(run):
858
  red_text += run.text
859
+
860
  if "signature" in red_text.lower():
861
  cell_replacements = replace_red_text_in_cell(cell, "[Signature]")
862
  replacements_made += cell_replacements
863
  elif "date" in red_text.lower():
864
  cell_replacements = replace_red_text_in_cell(cell, "[Date]")
865
  replacements_made += cell_replacements
866
+
867
  return replacements_made
868
 
869
  def handle_print_accreditation_section(table, flat_json):
870
  """Handle Print Accreditation section - SKIP Operator Declaration tables"""
871
  replacements_made = 0
872
+
873
+ # <<< PATCH: skip if operator declaration already processed
874
+ if getattr(table, "_processed_operator_declaration", False):
875
+ print(f" ⏭️ Skipping Print Accreditation - this is an Operator Declaration table")
876
+ return 0
877
+ # <<< END PATCH
878
+
879
  # Get table context to check what type of table this is
880
  table_context = ""
881
  for row in table.rows:
882
  for cell in row.cells:
883
  table_context += get_clean_text(cell).lower() + " "
884
+
885
  # SKIP if this is an Operator Declaration table
886
  if "operator declaration" in table_context or ("print name" in table_context and "position title" in table_context):
887
  print(f" ⏭️ Skipping Print Accreditation - this is an Operator Declaration table")
888
  return 0
889
+
890
  print(f" πŸ“‹ Processing Print Accreditation section")
891
+
892
  for row_idx, row in enumerate(table.rows):
893
  for cell_idx, cell in enumerate(row.cells):
894
  if has_red_text(cell):
895
  # Try print accreditation fields
896
  accreditation_fields = [
897
  "(print accreditation name)",
898
+ "Operator name (Legal entity)",
899
+ "Print accreditation name"
900
  ]
901
+
902
  for field in accreditation_fields:
903
  field_value = find_matching_json_value(field, flat_json)
904
  if field_value is not None:
 
909
  if cell_replacements > 0:
910
  print(f" βœ… Fixed accreditation: {field}")
911
  break
912
+
913
  return replacements_made
914
 
915
  def process_single_column_sections(cell, key_text, flat_json):
916
  """Process single column sections with red text"""
917
  replacements_made = 0
918
+
919
  if has_red_text(cell):
920
  red_text = ""
921
  for paragraph in cell.paragraphs:
922
  for run in paragraph.runs:
923
  if is_red(run):
924
  red_text += run.text
925
+
926
  if red_text.strip():
927
  # Try direct matching first
928
  section_value = find_matching_json_value(red_text.strip(), flat_json)
929
  if section_value is None:
930
  # Try key-based matching
931
  section_value = find_matching_json_value(key_text, flat_json)
932
+
933
  if section_value is not None:
934
  section_replacement = get_value_as_string(section_value, red_text.strip())
935
  cell_replacements = replace_red_text_in_cell(cell, section_replacement)
936
  replacements_made += cell_replacements
937
  if cell_replacements > 0:
938
  print(f" βœ… Fixed single column section: '{key_text}'")
939
+
940
  return replacements_made
941
 
942
+ # ============================================================================
943
+ # MAIN TABLE/PARAGRAPH PROCESSING
944
+ # ============================================================================
945
+
946
  def process_tables(document, flat_json):
947
  """Process all tables in the document with comprehensive fixes"""
948
  replacements_made = 0
949
+
950
  for table_idx, table in enumerate(document.tables):
951
  print(f"\nπŸ” Processing table {table_idx + 1}:")
952
+
953
  # Get table context
954
  table_text = ""
955
  for row in table.rows[:3]:
 
960
  management_summary_indicators = ["mass management", "maintenance management", "fatigue management"]
961
  has_management = any(indicator in table_text for indicator in management_summary_indicators)
962
  has_details = "details" in table_text
963
+
964
  if has_management and has_details:
965
  print(f" πŸ“‹ Detected Management Summary table")
966
  summary_fixes = fix_management_summary_details_column(table, flat_json)
967
  replacements_made += summary_fixes
968
+
969
  # Process remaining red text in management summary
970
  summary_replacements = 0
971
  for row_idx, row in enumerate(table.rows):
 
978
  if mgmt_type in flat_json:
979
  mgmt_data = flat_json[mgmt_type]
980
  if isinstance(mgmt_data, dict):
 
981
  for std_key, std_value in mgmt_data.items():
982
  if isinstance(std_value, list) and len(std_value) > 0:
 
983
  red_text = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red(run)).strip()
984
  for item in std_value:
985
  if len(red_text) > 15 and red_text.lower() in str(item).lower():
 
989
  print(f" βœ… Updated {std_key} with summary data")
990
  break
991
  break
992
+
 
993
  if summary_replacements == 0:
994
  cell_replacements = handle_management_summary_fix(cell, flat_json)
995
  summary_replacements += cell_replacements
996
+
997
  replacements_made += summary_replacements
998
  continue
999
+
1000
  # Detect Vehicle Registration tables
1001
  vehicle_indicators = ["registration number", "sub-contractor", "weight verification", "rfs suspension"]
1002
  indicator_count = sum(1 for indicator in vehicle_indicators if indicator in table_text)
 
1005
  vehicle_replacements = handle_vehicle_registration_table(table, flat_json)
1006
  replacements_made += vehicle_replacements
1007
  continue
1008
+
1009
  # Detect Attendance List tables
1010
  if "attendance list" in table_text and "names and position titles" in table_text:
1011
  print(f" πŸ‘₯ Detected Attendance List table")
1012
  attendance_replacements = handle_attendance_list_table_enhanced(table, flat_json)
1013
  replacements_made += attendance_replacements
1014
  continue
1015
+
1016
+ # Detect Print Accreditation / Operator Declaration tables
1017
  print_accreditation_indicators = ["print name", "position title"]
1018
  indicator_count = sum(1 for indicator in print_accreditation_indicators if indicator in table_text)
1019
+
1020
+ # <<< PATCH: require both indicators (or two matches) to reduce false positives
1021
+ if indicator_count >= 2: # i.e. both "print name" and "position title" appear in the table
1022
+ print(f" πŸ“‹ Detected Print Accreditation/Operator Declaration table")
1023
+
1024
+ # First, try strong operator declaration fix (exact keys)
1025
+ declaration_fixes = fix_operator_declaration_empty_values(table, flat_json)
1026
+ replacements_made += declaration_fixes
1027
+
1028
+ # Then only run print accreditation section if not marked processed
1029
+ if not getattr(table, "_processed_operator_declaration", False):
1030
+ print_accreditation_replacements = handle_print_accreditation_section(table, flat_json)
1031
+ replacements_made += print_accreditation_replacements
1032
+
1033
  continue
1034
+
1035
+ # Process regular table rows (same as your original logic)
1036
  for row_idx, row in enumerate(table.rows):
1037
  if len(row.cells) < 1:
1038
  continue
1039
+
1040
  key_cell = row.cells[0]
1041
  key_text = get_clean_text(key_cell)
1042
+
1043
  if not key_text:
1044
  continue
1045
+
1046
  print(f" πŸ“Œ Row {row_idx + 1}: Key = '{key_text}'")
1047
+
1048
  json_value = find_matching_json_value(key_text, flat_json)
1049
+
1050
  if json_value is not None:
1051
  replacement_text = get_value_as_string(json_value, key_text)
1052
+
1053
  # Handle Australian Company Number
1054
  if ("australian company number" in key_text.lower() or "company number" in key_text.lower()) and isinstance(json_value, list):
1055
  cell_replacements = handle_australian_company_number(row, json_value)
1056
  replacements_made += cell_replacements
1057
+
1058
  # Handle section headers
1059
  elif ("attendance list" in key_text.lower() or "nature of" in key_text.lower()) and row_idx + 1 < len(table.rows):
1060
  print(f" βœ… Section header detected, checking next row...")
1061
  next_row = table.rows[row_idx + 1]
1062
+
1063
  for cell_idx, cell in enumerate(next_row.cells):
1064
  if has_red_text(cell):
1065
  print(f" βœ… Found red text in next row, cell {cell_idx + 1}")
 
1069
  replacements_made += cell_replacements
1070
  if cell_replacements > 0:
1071
  print(f" -> Replaced section content")
1072
+
1073
  # Handle single column sections
1074
  elif len(row.cells) == 1 or (len(row.cells) > 1 and not any(has_red_text(row.cells[i]) for i in range(1, len(row.cells)))):
1075
  if has_red_text(key_cell):
1076
  cell_replacements = process_single_column_sections(key_cell, key_text, flat_json)
1077
  replacements_made += cell_replacements
1078
+
1079
  # Handle regular key-value pairs
1080
  else:
1081
  for cell_idx in range(1, len(row.cells)):
 
1084
  print(f" βœ… Found red text in column {cell_idx + 1}")
1085
  cell_replacements = replace_red_text_in_cell(value_cell, replacement_text)
1086
  replacements_made += cell_replacements
1087
+
1088
  else:
1089
  # Fallback processing for unmatched keys
1090
  if len(row.cells) == 1 and has_red_text(key_cell):
 
1099
  section_replacement = get_value_as_string(section_value, red_text.strip())
1100
  cell_replacements = replace_red_text_in_cell(key_cell, section_replacement)
1101
  replacements_made += cell_replacements
1102
+
1103
  # Process red text in all cells
1104
  for cell_idx in range(len(row.cells)):
1105
  cell = row.cells[cell_idx]
1106
  if has_red_text(cell):
1107
  cell_replacements = handle_multiple_red_segments_in_cell(cell, flat_json)
1108
  replacements_made += cell_replacements
1109
+
1110
  # Apply fixes if no replacements made
1111
  if cell_replacements == 0:
1112
  surgical_fix = handle_nature_business_multiline_fix(cell, flat_json)
1113
  replacements_made += surgical_fix
1114
+
1115
  if cell_replacements == 0:
1116
  management_summary_fix = handle_management_summary_fix(cell, flat_json)
1117
  replacements_made += management_summary_fix
1118
+
1119
  # Handle Operator/Auditor Declaration tables (check last few tables)
1120
  print(f"\n🎯 Final check for Declaration tables...")
1121
  for table in document.tables[-3:]:
1122
  if len(table.rows) <= 4:
1123
+ if getattr(table, "_processed_operator_declaration", False):
1124
+ print(f" ⏭️ Skipping - already processed by operator declaration handler")
1125
+ continue
1126
  declaration_fix = handle_operator_declaration_fix(table, flat_json)
1127
  replacements_made += declaration_fix
1128
+
1129
  return replacements_made
1130
 
1131
  def process_paragraphs(document, flat_json):
1132
  """Process all paragraphs in the document"""
1133
  replacements_made = 0
1134
  print(f"\nπŸ” Processing paragraphs:")
1135
+
1136
  for para_idx, paragraph in enumerate(document.paragraphs):
1137
  red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
1138
  if red_runs:
1139
  red_text_only = "".join(run.text for run in red_runs).strip()
1140
  print(f" πŸ“Œ Paragraph {para_idx + 1}: Found red text: '{red_text_only}'")
1141
+
1142
  json_value = find_matching_json_value(red_text_only, flat_json)
1143
+
1144
  if json_value is None:
1145
  # Enhanced pattern matching for signatures and dates
1146
  if "AUDITOR SIGNATURE" in red_text_only.upper() or "DATE" in red_text_only.upper():
1147
  json_value = find_matching_json_value("auditor signature", flat_json)
1148
  elif "OPERATOR SIGNATURE" in red_text_only.upper():
1149
  json_value = find_matching_json_value("operator signature", flat_json)
1150
+
1151
  if json_value is not None:
1152
  replacement_text = get_value_as_string(json_value)
1153
  print(f" βœ… Replacing red text with: '{replacement_text}'")
 
1156
  for run in red_runs[1:]:
1157
  run.text = ''
1158
  replacements_made += 1
1159
+
1160
  return replacements_made
1161
 
1162
  def process_headings(document, flat_json):
1163
  """Process headings and their related content"""
1164
  replacements_made = 0
1165
  print(f"\nπŸ” Processing headings:")
1166
+
1167
  paragraphs = document.paragraphs
1168
+
1169
  for para_idx, paragraph in enumerate(paragraphs):
1170
  paragraph_text = paragraph.text.strip()
1171
+
1172
  if not paragraph_text:
1173
  continue
1174
+
1175
  # Check if this is a heading
1176
  matched_heading = None
1177
  for category, patterns in HEADING_PATTERNS.items():
 
1181
  break
1182
  if matched_heading:
1183
  break
1184
+
1185
  if matched_heading:
1186
  print(f" πŸ“Œ Found heading at paragraph {para_idx + 1}: '{paragraph_text}'")
1187
+
1188
  # Check current heading paragraph
1189
  if has_red_text_in_paragraph(paragraph):
1190
  print(f" πŸ”΄ Found red text in heading itself")
1191
  heading_replacements = process_red_text_in_paragraph(paragraph, paragraph_text, flat_json)
1192
  replacements_made += heading_replacements
1193
+
1194
  # Look ahead for related content
1195
  for next_para_offset in range(1, 6):
1196
  next_para_idx = para_idx + next_para_offset
1197
  if next_para_idx >= len(paragraphs):
1198
  break
1199
+
1200
  next_paragraph = paragraphs[next_para_idx]
1201
  next_text = next_paragraph.text.strip()
1202
+
1203
  if not next_text:
1204
  continue
1205
+
1206
  # Stop if we hit another heading
1207
  is_another_heading = False
1208
  for category, patterns in HEADING_PATTERNS.items():
 
1212
  break
1213
  if is_another_heading:
1214
  break
1215
+
1216
  if is_another_heading:
1217
  break
1218
+
1219
  # Process red text with context
1220
  if has_red_text_in_paragraph(next_paragraph):
1221
  print(f" πŸ”΄ Found red text in paragraph {next_para_idx + 1} after heading")
1222
+
1223
  context_replacements = process_red_text_in_paragraph(
1224
+ next_paragraph,
1225
  paragraph_text,
1226
  flat_json
1227
  )
1228
  replacements_made += context_replacements
1229
+
1230
  return replacements_made
1231
 
1232
  def process_red_text_in_paragraph(paragraph, context_text, flat_json):
1233
  """Process red text within a paragraph using context"""
1234
  replacements_made = 0
1235
+
1236
  red_text_segments = []
1237
  for run in paragraph.runs:
1238
  if is_red(run) and run.text.strip():
1239
  red_text_segments.append(run.text.strip())
1240
+
1241
  if not red_text_segments:
1242
  return 0
1243
+
1244
  combined_red_text = " ".join(red_text_segments).strip()
1245
  print(f" πŸ” Red text found: '{combined_red_text}'")
1246
+
1247
  json_value = None
1248
+
1249
  # Direct matching
1250
  json_value = find_matching_json_value(combined_red_text, flat_json)
1251
+
1252
  # Context-based matching
1253
  if json_value is None:
1254
  if "NHVAS APPROVED AUDITOR" in context_text.upper():
 
1258
  if json_value is not None:
1259
  print(f" βœ… Found auditor match with field: '{field}'")
1260
  break
1261
+
1262
  elif "OPERATOR DECLARATION" in context_text.upper():
1263
  operator_fields = ["operator name", "operator", "company name", "organisation name", "print name"]
1264
  for field in operator_fields:
 
1266
  if json_value is not None:
1267
  print(f" βœ… Found operator match with field: '{field}'")
1268
  break
1269
+
1270
  # Combined context queries
1271
  if json_value is None:
1272
  context_queries = [
 
1274
  combined_red_text,
1275
  context_text
1276
  ]
1277
+
1278
  for query in context_queries:
1279
  json_value = find_matching_json_value(query, flat_json)
1280
  if json_value is not None:
1281
  print(f" βœ… Found match with combined query")
1282
  break
1283
+
1284
  # Replace if match found
1285
  if json_value is not None:
1286
  replacement_text = get_value_as_string(json_value, combined_red_text)
1287
+
1288
  red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
1289
  if red_runs:
1290
  red_runs[0].text = replacement_text
1291
  red_runs[0].font.color.rgb = RGBColor(0, 0, 0)
1292
+
1293
  for run in red_runs[1:]:
1294
  run.text = ''
1295
+
1296
  replacements_made = 1
1297
  print(f" βœ… Replaced with: '{replacement_text}'")
1298
  else:
1299
  print(f" ❌ No match found for red text: '{combined_red_text}'")
1300
+
1301
  return replacements_made
1302
 
1303
  def force_red_text_replacement(document, flat_json):
1304
  """Force replacement of any remaining red text by trying ALL JSON values"""
1305
  replacements_made = 0
1306
  print(f"\n🎯 FORCE FIX: Scanning for any remaining red text...")
1307
+
1308
  # Collect all possible replacement values from JSON
1309
  all_values = {}
1310
  for key, value in flat_json.items():
1311
  if value:
1312
  value_str = get_value_as_string(value, key)
1313
+
1314
  if value_str and isinstance(value_str, str) and value_str.strip():
1315
  all_values[key] = value_str.strip()
1316
+
1317
  # Store individual items from lists for partial matching
1318
  if isinstance(value, list):
1319
  for i, item in enumerate(value):
1320
  item_str = str(item).strip() if item else ""
1321
  if item_str:
1322
  all_values[f"{key}_item_{i}"] = item_str
1323
+
1324
  print(f" Found {len(all_values)} potential replacement values")
1325
+
1326
  # Process all tables
1327
  for table_idx, table in enumerate(document.tables):
1328
  for row_idx, row in enumerate(table.rows):
1329
  for cell_idx, cell in enumerate(row.cells):
1330
  if has_red_text(cell):
1331
  print(f" πŸ” Found red text in Table {table_idx + 1}, Row {row_idx + 1}, Cell {cell_idx + 1}")
1332
+
1333
  # Extract all red text from this cell
1334
  red_text_parts = []
1335
  for paragraph in cell.paragraphs:
1336
  for run in paragraph.runs:
1337
  if is_red(run) and run.text.strip():
1338
  red_text_parts.append(run.text.strip())
1339
+
1340
  combined_red_text = " ".join(red_text_parts).strip()
1341
  print(f" Red text: '{combined_red_text}'")
1342
+
1343
+ # safety: when red text is very short, avoid replacing with very long multi-item values
1344
+ red_len_words = len(combined_red_text.split())
1345
+
1346
  # Find best match
1347
  best_match = None
1348
  best_key = None
1349
+
1350
+ # Exact matching (prefer exact)
1351
  for key, value in all_values.items():
1352
  if combined_red_text.lower() == value.lower():
1353
  best_match = value
1354
  best_key = key
1355
  break
1356
+
1357
+ # Partial matching (skip aggressive short->long mapping)
1358
  if not best_match:
1359
  for key, value in all_values.items():
1360
+ # <<< PATCH: skip matching single-word red_text to multi-item candidate values
1361
+ if red_len_words <= 2 and isinstance(value, str) and len(value.split()) > 3:
1362
+ continue
1363
  if (len(value) > 3 and value.lower() in combined_red_text.lower()) or \
1364
  (len(combined_red_text) > 3 and combined_red_text.lower() in value.lower()):
1365
  best_match = value
1366
  best_key = key
1367
  break
1368
+
1369
  # Word-by-word matching for names/dates
1370
  if not best_match:
1371
  red_words = set(word.lower() for word in combined_red_text.split() if len(word) > 2)
1372
  best_score = 0
1373
+
1374
  for key, value in all_values.items():
1375
+ # skip aggressive substitution for short red tokens vs long values
1376
+ if red_len_words <= 2 and isinstance(value, str) and len(value.split()) > 4:
1377
+ continue
1378
  value_words = set(word.lower() for word in str(value).split() if len(word) > 2)
1379
  if red_words and value_words:
1380
  common_words = red_words.intersection(value_words)
 
1384
  best_score = score
1385
  best_match = value
1386
  best_key = key
1387
+
1388
  # Replace if we found a match
1389
  if best_match:
1390
  print(f" βœ… Replacing with: '{best_match}' (from key: '{best_key}')")
 
1393
  print(f" Made {cell_replacements} replacements")
1394
  else:
1395
  print(f" ❌ No suitable replacement found")
1396
+
1397
  # Process all paragraphs
1398
  for para_idx, paragraph in enumerate(document.paragraphs):
1399
  if has_red_text_in_paragraph(paragraph):
 
1401
  for run in paragraph.runs:
1402
  if is_red(run) and run.text.strip():
1403
  red_text_parts.append(run.text.strip())
1404
+
1405
  combined_red_text = " ".join(red_text_parts).strip()
1406
  if combined_red_text:
1407
  print(f" πŸ” Found red text in Paragraph {para_idx + 1}: '{combined_red_text}'")
1408
+
1409
  # Same matching logic as above
1410
  best_match = None
1411
  best_key = None
1412
+
1413
+ red_len_words = len(combined_red_text.split())
1414
+
1415
  # Exact match
1416
  for key, value in all_values.items():
1417
  if combined_red_text.lower() == value.lower():
1418
  best_match = value
1419
  best_key = key
1420
  break
1421
+
1422
  # Partial match
1423
  if not best_match:
1424
  for key, value in all_values.items():
1425
+ if red_len_words <= 2 and isinstance(value, str) and len(value.split()) > 3:
1426
+ continue
1427
  if (len(value) > 3 and value.lower() in combined_red_text.lower()) or \
1428
  (len(combined_red_text) > 3 and combined_red_text.lower() in value.lower()):
1429
  best_match = value
1430
  best_key = key
1431
  break
1432
+
1433
  # Word match
1434
  if not best_match:
1435
  red_words = set(word.lower() for word in combined_red_text.split() if len(word) > 2)
1436
  best_score = 0
1437
+
1438
  for key, value in all_values.items():
1439
+ if red_len_words <= 2 and isinstance(value, str) and len(value.split()) > 4:
1440
+ continue
1441
  value_words = set(word.lower() for word in str(value).split() if len(word) > 2)
1442
  if red_words and value_words:
1443
  common_words = red_words.intersection(value_words)
 
1447
  best_score = score
1448
  best_match = value
1449
  best_key = key
1450
+
1451
  # Replace if found
1452
  if best_match:
1453
  print(f" βœ… Replacing with: '{best_match}' (from key: '{best_key}')")
 
1461
  print(f" Made 1 paragraph replacement")
1462
  else:
1463
  print(f" ❌ No suitable replacement found")
1464
+
1465
  return replacements_made
1466
 
1467
  def process_hf(json_file, docx_file, output_file):
 
1473
  else:
1474
  with open(json_file, 'r', encoding='utf-8') as f:
1475
  json_data = json.load(f)
1476
+
1477
  flat_json = flatten_json(json_data)
1478
  print("πŸ“„ Available JSON keys (sample):")
1479
  for i, (key, value) in enumerate(sorted(flat_json.items())):
 
1489
 
1490
  # Process document with all fixes
1491
  print("πŸš€ Starting comprehensive document processing...")
1492
+
1493
  table_replacements = process_tables(doc, flat_json)
1494
  paragraph_replacements = process_paragraphs(doc, flat_json)
1495
  heading_replacements = process_headings(doc, flat_json)
1496
+
1497
  # Final force fix for any remaining red text
1498
  force_replacements = force_red_text_replacement(doc, flat_json)
1499
+
1500
  total_replacements = table_replacements + paragraph_replacements + heading_replacements + force_replacements
1501
 
1502
  # Save output
 
1504
  doc.save(output_file)
1505
  else:
1506
  doc.save(output_file)
1507
+
1508
  print(f"\nβœ… Document saved as: {output_file}")
1509
  print(f"βœ… Total replacements: {total_replacements}")
1510
  print(f" πŸ“Š Tables: {table_replacements}")