"""
Context-Aware Correction Checker for Gender Bias Detection

This module implements context detection to prevent over-correction of legitimate
gender references. It checks for conditions where bias correction should be skipped:
- Quoted text (historical quotes, citations)
- Proper nouns (organization names, titles)
- Historical context (past references, dates)
- Biographical context (specific person references)
- Statistical context (factual gender-specific data)
- Medical context (biological/health accuracy)
- Counter-stereotypes (positive challenges to stereotypes)
- Legal context (court and official-document language)
- Artistic context (songs, films, books, characters)
- Organization names (known acronyms and registered bodies)

Based on industry best practices from:
- MBIAS: Mitigating Bias While Retaining Context
- SC2: Content Preservation in Long Text Style Transfer
- Token-Level Disentanglement approaches
"""

import logging
import re
from dataclasses import dataclass
from enum import Enum
from typing import Dict, List, Optional, Tuple


class ContextCondition(Enum):
    """Context conditions that may prevent correction.

    The string values are the tokens accepted in a lexicon entry's
    pipe-separated ``avoid_when`` field (e.g. ``"quote|historical"``).
    """

    QUOTE = "quote"                            # term sits inside quoted/reported speech
    HISTORICAL = "historical"                  # dated or past-tense reference
    PROPER_NOUN = "proper_noun"                # term is part of a name or title
    BIOGRAPHICAL = "biographical"              # statement about a specific person
    STATISTICAL = "statistical"                # percentages, counts, survey data
    MEDICAL = "medical"                        # pregnancy, health and clinical wording
    COUNTER_STEREOTYPE = "counter_stereotype"  # text that challenges a stereotype
    LEGAL = "legal"                            # court / official-document language
    ARTISTIC = "artistic"                      # songs, films, books, characters
    ORGANIZATION = "organization"              # organisation names and acronyms


@dataclass
class ContextCheckResult:
    """Result of a context check.

    Attributes:
        should_correct: True when no blocking context was found and the bias
            correction may proceed; False when the correction must be skipped.
        blocked_by: The condition that blocked the correction, or None when
            nothing blocked it.
        reason: Human-readable explanation of the decision.
        confidence: Score attached to the decision (1.0 by default).
        matched_pattern: The regex (or a short label) that triggered a block;
            empty when nothing matched.
    """

    should_correct: bool
    blocked_by: Optional[ContextCondition] = None
    reason: str = ""
    confidence: float = 1.0
    matched_pattern: str = ""


class ContextChecker:
    """
    Checks text context to determine if bias correction should be applied.

    This helps preserve meaning in cases where gender references are:
    - Historically accurate
    - Part of proper nouns/organization names
    - Quoting someone directly
    - Providing statistical facts
    - Medically/biologically necessary

    Pattern conventions used by ``CONTEXT_PATTERNS``:
    - Templates containing ``{term}`` are expanded with ``str.format`` on every
      lookup, so literal regex braces inside them are doubled (``{{0,30}}``).
    - Templates without ``{term}`` are compiled verbatim once per instance and
      therefore use single braces. They describe text-level context: they
      match regardless of which term is being checked.
    - Every context pattern is compiled with ``re.IGNORECASE``. Fragments that
      depend on capitalisation are wrapped in ``(?-i:...)`` so they stay
      case-sensitive.
    """

    # Placeholder that marks a template as term-specific.
    _TERM_PLACEHOLDER = '{term}'

    # Flags shared by all context patterns.
    _REGEX_FLAGS = re.IGNORECASE | re.UNICODE

    CONTEXT_PATTERNS: Dict[ContextCondition, List[str]] = {
        ContextCondition.QUOTE: [
            # Term enclosed in straight, guillemet, low-9 or curly quotes.
            r'"[^"]{{0,100}}{term}[^"]{{0,100}}"',
            r"'[^']{{0,100}}{term}[^']{{0,100}}'",
            r'«[^»]{{0,100}}{term}[^»]{{0,100}}»',
            r'„[^"\u201c]{{0,100}}{term}[^"\u201c]{{0,100}}["\u201c]',
            # NOTE(review): the source carried three identical straight-quote
            # patterns here; presumably the typographic variants were lost in
            # an encoding round-trip, so curly quotes are matched explicitly.
            r'\u201c[^\u201d]{{0,100}}{term}[^\u201d]{{0,100}}\u201d',

            # Reported speech (Swahili and English verbs of saying).
            r'\b(alisema|anasema|walisema|said|says|stated|wrote|claimed)\b.{{0,50}}{term}',
            r'{term}.{{0,50}}\b(alisema|anasema|said|says)\b',
        ],

        ContextCondition.HISTORICAL: [
            # Four-digit years near the term.
            r'\b(mwaka\s+)?\d{{4}}\b.{{0,50}}{term}',
            r'{term}.{{0,50}}\b(mwaka\s+)?\d{{4}}\b',
            r'\bin\s+\d{{4}}\b.{{0,30}}{term}',

            # Swahili history markers.
            r'\b(kihistoria|historia|zamani|kale|enzi)\b.{{0,50}}{term}',
            r'{term}.{{0,50}}\b(kihistoria|historia|zamani)\b',

            # English history markers.
            r'\b(historically|history|ancient|traditional|formerly)\b.{{0,50}}{term}',

            # Past-tense verbs.
            r'\b(ilikuwa|walikuwa|alikuwa|was|were|used\s+to)\b.{{0,30}}{term}',
        ],

        ContextCondition.PROPER_NOUN: [
            # Term followed by a capitalised word. The first form anchors at
            # the start of the text or a sentence and keeps the original
            # "term does not start with a capital" lookahead; it replaces a
            # variable-width lookbehind that Python's re module rejects.
            r'(?:\A|(?<=[.!?])\s{{1,5}})(?-i:(?![A-Z]))\b{term}\s+(?-i:[A-Z][a-z]+)',
            r'(?<=(?-i:[a-z]))\s+{term}\s+(?-i:[A-Z][a-z]+)',

            # Swahili teknonyms ("Mama Robert", "Baba Amina"). Term-less, so
            # compiled verbatim with single braces.
            r'\b[Mm]ama\s+(?-i:[A-Z][a-z]{2,})',
            r'\b[Bb]aba\s+(?-i:[A-Z][a-z]{2,})',

            # Capitalised word directly before the term, not at sentence start.
            r'(?<=(?-i:[a-z.,;:])\s)(?-i:[A-Z][a-z]+)\s+{term}',

            # Swahili organisation prefixes.
            r'\b(Chama\s+cha|Shirika\s+la|Taasisi\s+ya|Kampuni\s+ya)\b.{{0,30}}{term}',

            # English organisation words and company suffixes.
            r'\b(Organization|Company|Association|Foundation|Institute)\s+.{{0,20}}{term}',
            r'{term}.{{0,20}}\b(Inc|Ltd|LLC|Corp|Foundation)\b',

            # Honorifics and titles.
            r'\b(Mheshimiwa|Dkt\.|Dr\.|Prof\.|Mr\.|Mrs\.|Ms\.)\s+.{{0,20}}{term}',
        ],

        ContextCondition.BIOGRAPHICAL: [
            # Swahili statements about a specific person.
            r'\b(yeye|huyu|yule)\s+(ni|alikuwa|amekuwa).{{0,30}}{term}',
            r'{term}\s+wa\s+kwanza',
            r'\baliyekuwa\b.{{0,20}}{term}',
            r'\balikuwa\b.{{0,20}}{term}',

            # English statements about a specific person.
            r'\b(she|he)\s+(is|was|became|served\s+as).{{0,30}}{term}',
            r'\bthe\s+first\s+(female|male|woman|man)\s+{term}',
        ],

        ContextCondition.STATISTICAL: [
            # Percentages near the term.
            r'\d+(\.\d+)?%\s*.{{0,30}}{term}',
            r'{term}.{{0,30}}\d+(\.\d+)?%',

            # Swahili statistics vocabulary.
            r'\b(takwimu|idadi|asilimia|wastani)\b.{{0,30}}{term}',

            # English statistics vocabulary.
            r'\b(statistics|data|survey|study|research|percent|majority|minority)\b.{{0,30}}{term}',

            # Ratios such as "3 kati ya 10" / "3 out of 10".
            r'\b\d+\s+(kati\s+ya|out\s+of|of\s+the)\s+\d+\b.{{0,30}}{term}',
        ],

        ContextCondition.MEDICAL: [
            # Swahili pregnancy / childbirth vocabulary.
            r'\b(mjamzito|ujauzito|uzazi|kujifungua|mimba)\b.{{0,50}}{term}',
            r'{term}.{{0,50}}\b(mjamzito|ujauzito|uzazi|kujifungua)\b',

            # Term directly adjacent to "mjamzito".
            r'\b{term}\s+mjamzito\b',
            r'\bmjamzito.{{0,10}}{term}',

            # English maternal-health vocabulary. The stems take \w* so that
            # "obstetrics", "gynecology", "gynaecologist" etc. match; a bare
            # stem followed by \b never matched those words.
            r'\b(pregnant|pregnancy|childbirth|maternal|obstetric\w*|gyna?ecolog\w*)\b.{{0,50}}{term}',

            # Sex-specific conditions.
            r'\b(saratani\s+ya\s+shingo|cervical\s+cancer|breast\s+cancer|prostate)\b.{{0,50}}{term}',

            # Clinical settings and roles.
            r'\b(hospitali|clinic|daktari|nurse|doctor|hospital)\b.{{0,30}}{term}',
        ],

        ContextCondition.COUNTER_STEREOTYPE: [
            # Text-level patterns (no {term}): women in male-typed roles and
            # men in female-typed roles, Swahili then English.
            r'\b(mwanamke|mama)\b.{0,30}\b(mhandisi|rubani|fundi|mkurugenzi|daktari)\b',
            r'\b(mwanamume|baba)\b.{0,30}\b(muuguzi|mkunga|mlezi|mpishi)\b',

            r'\b(female|woman|she)\b.{0,30}\b(engineer|pilot|mechanic|CEO|surgeon)\b',
            r'\b(male|man|he)\b.{0,30}\b(nurse|secretary|nanny|caregiver)\b',

            # "First female/male ..." achievements.
            r'\b(wa\s+kwanza|first)\b.{0,20}\b(wa\s+kike|wa\s+kiume|female|male)\b',
        ],

        ContextCondition.LEGAL: [
            # Swahili legal vocabulary.
            r'\b(sheria|mahakama|kesi|mshtakiwa|mlalamikaji)\b.{{0,30}}{term}',

            # English legal vocabulary.
            r'\b(court|legal|plaintiff|defendant|witness|law|statute)\b.{{0,30}}{term}',

            # Official documents.
            r'\b(hati|certificate|document|official|sworn)\b.{{0,30}}{term}',
        ],

        ContextCondition.ARTISTIC: [
            # Creative works, Swahili then English.
            r'\b(wimbo|filamu|kitabu|hadithi|mchezo)\b.{{0,30}}{term}',
            r'\b(song|film|movie|book|novel|play|poem|lyrics)\b.{{0,30}}{term}',

            # Characters and roles.
            r'\b(mhusika|character|role|actor|actress)\b.{{0,30}}{term}',
        ],

        ContextCondition.ORGANIZATION: [
            # Known organisation acronyms (text-level, no {term}).
            r'\b(TAWOMA|BAWATA|TAMWA|UWT)\b',
            r'\bChama\s+cha\s+\w+\s+{term}',

            # Any upper-case acronym shortly before the term; kept
            # case-sensitive so ordinary short words do not count.
            r'\b(?-i:[A-Z]{{2,6}})\b.{{0,20}}{term}',
        ],
    }

    # Swahili naming conventions that must be preserved when the biased term
    # is part of the match. These are matched case-sensitively.
    SWAHILI_PRESERVE_PATTERNS = [
        # "Mama <Name>"
        r'\b[Mm]ama\s+[A-Z][a-z]+\b',

        # "Baba <Name>"
        r'\b[Bb]aba\s+[A-Z][a-z]+\b',

        # Other kinship titles followed by a name.
        r'\b(Bibi|Babu|Shangazi|Mjomba)\s+[A-Z][a-z]+\b',
    ]

    def __init__(self, strict_mode: bool = False):
        """
        Initialize the context checker.

        Args:
            strict_mode: If True, any context match blocks correction.
                If False, uses confidence scoring.
                NOTE(review): the flag is stored on the instance, but no code
                in this class branches on it yet; every match blocks.
        """
        self.strict_mode = strict_mode
        self._compiled_patterns: Dict[ContextCondition, List[re.Pattern]] = {}
        self._preserve_patterns: List[re.Pattern] = []
        self._compile_patterns()

    def _compile_patterns(self) -> None:
        """Pre-compile term-less patterns and validate term templates.

        Term-less patterns are compiled once and stored per condition.
        Templates containing ``{term}`` cannot be compiled ahead of time, so
        they are expanded with a probe term to surface broken templates at
        construction time. Invalid patterns are skipped with a logged warning
        rather than silently ignored.
        """
        log = logging.getLogger(__name__)

        for condition, templates in self.CONTEXT_PATTERNS.items():
            static_patterns: List[re.Pattern] = []
            for template in templates:
                if self._TERM_PLACEHOLDER in template:
                    if self._get_pattern_for_term(template, "x") is None:
                        log.warning(
                            "Ignoring invalid %s context template: %r",
                            condition.value, template,
                        )
                    continue
                try:
                    static_patterns.append(re.compile(template, self._REGEX_FLAGS))
                except re.error as exc:
                    log.warning(
                        "Ignoring invalid %s context pattern %r: %s",
                        condition.value, template, exc,
                    )
            self._compiled_patterns[condition] = static_patterns

        # Compiled without IGNORECASE: these rely on capitalised names.
        self._preserve_patterns = [
            re.compile(pattern) for pattern in self.SWAHILI_PRESERVE_PATTERNS
        ]

    def _get_pattern_for_term(self, pattern_template: str, term: str) -> Optional[re.Pattern]:
        """Create a compiled pattern with the specific term inserted.

        The term is regex-escaped before insertion, so it is matched literally.

        Returns:
            The compiled pattern, or None when the template cannot be
            formatted or compiled.
        """
        try:
            pattern = pattern_template.format(term=re.escape(term))
            return re.compile(pattern, self._REGEX_FLAGS)
        except (re.error, KeyError, IndexError, ValueError):
            return None

    @staticmethod
    def _blocked(
        condition: ContextCondition,
        matched_pattern: str,
        detail: str = ""
    ) -> ContextCheckResult:
        """Build the result returned when ``condition`` blocks a correction."""
        return ContextCheckResult(
            should_correct=False,
            blocked_by=condition,
            reason=f"Detected {condition.value} context{detail}",
            confidence=0.85,
            matched_pattern=matched_pattern
        )

    def check_context(
        self,
        text: str,
        biased_term: str,
        avoid_when: str = "",
        constraints: str = ""
    ) -> ContextCheckResult:
        """
        Check if correction should be applied based on context.

        Args:
            text: Full text being analyzed
            biased_term: The specific biased term found
            avoid_when: Pipe-separated list of conditions from lexicon.
                When it is empty or names no known condition, the quote,
                proper-noun and biographical checks are run by default.
            constraints: Additional constraints from lexicon (accepted for
                interface compatibility; not consulted by this method)

        Returns:
            ContextCheckResult indicating whether to proceed with correction
        """
        conditions_to_check = self._parse_avoid_when(avoid_when)

        if not conditions_to_check:
            conditions_to_check = [
                ContextCondition.QUOTE,
                ContextCondition.PROPER_NOUN,
                ContextCondition.BIOGRAPHICAL,
            ]

        # The first blocking condition wins.
        for condition in conditions_to_check:
            result = self._check_condition(text, biased_term, condition)
            if not result.should_correct:
                return result

        # Swahili naming conventions are always honoured, whatever conditions
        # were requested. Every occurrence is inspected, not only the first,
        # so a later "Mama <Name>" containing the term is still found.
        term_lower = biased_term.lower()
        for pattern in self._preserve_patterns:
            for match in pattern.finditer(text):
                matched_text = match.group(0)
                if term_lower in matched_text.lower():
                    return ContextCheckResult(
                        should_correct=False,
                        blocked_by=ContextCondition.PROPER_NOUN,
                        reason=f"Term is part of Swahili naming convention: {matched_text}",
                        confidence=0.9,
                        matched_pattern=pattern.pattern
                    )

        return ContextCheckResult(
            should_correct=True,
            reason="No blocking context detected",
            confidence=1.0
        )

    def _parse_avoid_when(self, avoid_when: str) -> List[ContextCondition]:
        """Parse the avoid_when field into ContextCondition enums.

        Entries are split on ``|``, stripped and lower-cased; entries that do
        not name a known condition are ignored.
        """
        if not avoid_when or not avoid_when.strip():
            return []

        conditions = []
        for part in avoid_when.split('|'):
            try:
                conditions.append(ContextCondition(part.strip().lower()))
            except ValueError:
                # Unknown condition names in the lexicon are skipped.
                continue

        return conditions

    def _check_condition(
        self,
        text: str,
        term: str,
        condition: ContextCondition
    ) -> ContextCheckResult:
        """Check a specific context condition.

        Term-specific templates are tried first, then the pre-compiled
        text-level patterns. For the biographical condition, two extra
        name heuristics are applied last.
        """
        for template in self.CONTEXT_PATTERNS.get(condition, []):
            if self._TERM_PLACEHOLDER not in template:
                continue
            pattern = self._get_pattern_for_term(template, term)
            if pattern is not None and pattern.search(text):
                return self._blocked(condition, template)

        # Each text-level pattern is scanned exactly once.
        for compiled in self._compiled_patterns.get(condition, []):
            if compiled.search(text):
                return self._blocked(condition, compiled.pattern)

        if condition == ContextCondition.BIOGRAPHICAL:
            escaped_term = re.escape(term)

            # "Firstname Lastname ... term": names stay case-sensitive while
            # the term is matched case-insensitively like everywhere else.
            name_then_term = re.compile(
                r'(?-i:[A-Z][a-z]+\s+[A-Z][a-z]+).{0,30}' + escaped_term,
                self._REGEX_FLAGS
            )
            if name_then_term.search(text):
                return self._blocked(condition, "[Name] + term", " (name reference)")

            # "term [wa] Firstname [Lastname]"
            term_then_name = re.compile(
                escaped_term + r'(?-i:\s+(wa\s+)?[A-Z][a-z]+(\s+[A-Z][a-z]+)?)',
                self._REGEX_FLAGS
            )
            if term_then_name.search(text):
                return self._blocked(condition, "term + [Name]", " (name reference)")

        return ContextCheckResult(
            should_correct=True,
            reason=f"No {condition.value} context detected",
            confidence=1.0
        )

    def is_in_quotes(self, text: str, term: str) -> bool:
        """Quick check if term appears within quotes.

        Only straight double and single quotes are considered; the term is
        matched literally and case-insensitively.
        """
        escaped_term = re.escape(term)
        quote_patterns = (
            r'"[^"]*' + escaped_term + r'[^"]*"',
            r"'[^']*" + escaped_term + r"[^']*'",
        )
        return any(
            re.search(pattern, text, re.IGNORECASE) for pattern in quote_patterns
        )

    def extract_proper_nouns(self, text: str) -> List[str]:
        """
        Extract potential proper nouns from text.

        Useful for preserving entities during ML fallback correction.

        The first word of every sentence is skipped because its capital
        letter says nothing about whether it is a name. Remaining words are
        stripped of non-word characters first, so names wrapped in
        punctuation (e.g. ``(Nairobi)``) are recognised too.

        Returns:
            Unique candidates of at least two characters, in order of first
            appearance.
        """
        # A dict is used as an ordered set so the output is deterministic.
        found: Dict[str, None] = {}

        for sentence in re.split(r'[.!?]\s+', text):
            for word in sentence.split()[1:]:
                cleaned = re.sub(r'[^\w]', '', word)
                if len(cleaned) > 1 and cleaned[0].isupper():
                    found[cleaned] = None

        return list(found)

    def get_preservation_entities(self, text: str) -> List[str]:
        """
        Get entities that should be preserved during correction.

        Combines proper nouns, organization names, and other key entities.

        Returns:
            Unique entities in order of first discovery: proper nouns first,
            then upper-case acronyms, then pairs of capitalised words.
        """
        entities: Dict[str, None] = dict.fromkeys(self.extract_proper_nouns(text))

        org_patterns = (
            r'\b[A-Z]{2,6}\b',                 # acronyms
            r'\b[A-Z][a-z]+\s+[A-Z][a-z]+\b',  # two capitalised words
        )

        for pattern in org_patterns:
            entities.update(dict.fromkeys(re.findall(pattern, text)))

        return list(entities)


# Shared checker for should_apply_correction(); created on first use so the
# pattern compilation cost is paid once rather than on every call.
_DEFAULT_CHECKER = None


def should_apply_correction(
    text: str,
    biased_term: str,
    avoid_when: str = "",
    constraints: str = ""
) -> Tuple[bool, str]:
    """
    Quick check if correction should be applied.

    Convenience wrapper around ``ContextChecker.check_context`` that reuses a
    single default-configured checker across calls.

    Args:
        text: Full text being analyzed
        biased_term: The biased term found
        avoid_when: Conditions from lexicon
        constraints: Additional constraints

    Returns:
        Tuple of (should_correct: bool, reason: str)
    """
    global _DEFAULT_CHECKER
    if _DEFAULT_CHECKER is None:
        _DEFAULT_CHECKER = ContextChecker()

    result = _DEFAULT_CHECKER.check_context(text, biased_term, avoid_when, constraints)
    return result.should_correct, result.reason


if __name__ == "__main__":
    # Manual smoke test: run a handful of sentences through the checker and
    # print whether each correction would be applied or skipped.
    checker = ContextChecker()

    # Each case is (text, biased term, avoid_when conditions).
    test_cases = [
        # Swahili naming convention, checked as a proper noun.
        ("Mama Robert alisema watoto wapate elimu", "mama Robert", "proper_noun"),

        # Quoted statement with a historical marker.
        ('"Mwanamke anapaswa kukaa nyumbani" alisema mtu zamani', "mwanamke anapaswa", "quote|historical"),

        # Statement about named individuals.
        ("Winnie Mandela alikuwa mke wa Nelson Mandela", "mke wa", "biographical"),

        # Percentage next to the term.
        ("70% ya wanawake wanafanya kazi", "wanawake", "statistical"),

        # Pregnancy-related wording.
        ("Mama mjamzito anahitaji huduma", "mama", "medical"),

        # No avoid_when: the default condition set is used.
        ("Wanawake hawafai kuongoza", "wanawake", ""),

        # No avoid_when: the default condition set is used.
        ("Mwanamke anapaswa kupika", "mwanamke anapaswa", ""),
    ]

    print("Context Checker Test Results")
    print("=" * 60)

    for text, term, avoid_when in test_cases:
        result = checker.check_context(text, term, avoid_when)
        status = "CORRECT" if result.should_correct else "SKIP"
        # Only mark the preview as truncated when it actually was.
        preview = text if len(text) <= 60 else f"{text[:60]}..."
        print(f"\n[{status}] Term: '{term}'")
        print(f" Text: {preview}")
        print(f" Reason: {result.reason}")
        if result.blocked_by:
            print(f" Blocked by: {result.blocked_by.value}")