| import random |
| import re |
|
|
| try: |
| from faker import Faker |
| except ModuleNotFoundError: |
| Faker = None |
|
|
|
|
| class _FallbackFaker: |
| def name(self) -> str: |
| return random.choice(["Alex Morgan", "Jordan Lee", "Taylor Brooks", "Casey Patel"]) |
|
|
| def last_name(self) -> str: |
| return random.choice(["Morgan", "Lee", "Brooks", "Patel", "Reed"]) |
|
|
| def company(self) -> str: |
| return random.choice( |
| ["Global Research Institute", "Civic Data Group", "Archive Analytics Lab"] |
| ) |
|
|
| def word(self) -> str: |
| return random.choice(["revised", "alternate", "disputed", "corrected"]) |
|
|
|
|
# Shared generator instance: the real Faker when its import succeeded
# (``Faker`` is not None), otherwise the built-in fallback.
if Faker:
    fake = Faker()
else:
    fake = _FallbackFaker()
|
|
# Pools of well-known entities; an answer found in one of these is swapped
# for another member of the same pool.
COUNTRIES = [
    "France", "Germany", "Brazil", "Japan", "Canada",
    "India", "Australia", "Kenya", "Mexico", "Norway",
]
CITIES = [
    "Paris", "Berlin", "Tokyo", "Toronto", "Mumbai",
    "Sydney", "Nairobi", "Mexico City", "Oslo", "Rome",
]
ORGANIZATIONS = [
    "World Health Organization", "United Nations", "NASA", "Oxford University",
    "Reuters", "Smithsonian Institution", "International Monetary Fund", "Royal Society",
]

# Symmetric lowercase word -> opposite mapping. Each pair is registered in
# both directions, in pair order, so the key order is stable.
ANTONYMS = {
    word: opposite
    for left, right in (
        ("largest", "smallest"),
        ("first", "last"),
        ("highest", "lowest"),
        ("won", "lost"),
        ("north", "south"),
        ("east", "west"),
        ("increase", "decrease"),
        ("before", "after"),
        ("true", "false"),
        ("older", "newer"),
        ("major", "minor"),
    )
    for word, opposite in ((left, right), (right, left))
}
|
|
|
|
| def _preserve_case(original: str, replacement: str) -> str: |
| if original.isupper(): |
| return replacement.upper() |
| if original.istitle(): |
| return replacement.title() |
| if original.islower(): |
| return replacement.lower() |
| return replacement |
|
|
|
|
| def _replace_first_case_insensitive(text: str, target: str, replacement: str) -> str: |
| pattern = re.compile(re.escape(target), re.IGNORECASE) |
|
|
| def repl(match: re.Match[str]) -> str: |
| return _preserve_case(match.group(0), replacement) |
|
|
| return pattern.sub(repl, text, count=1) |
|
|
|
|
| def _different_choice(options: list[str], current: str) -> str: |
| viable = [option for option in options if option.lower() != current.lower()] |
| return random.choice(viable or options) |
|
|
|
|
def corrupt_number(text: str, answer: str) -> str:
    """Return ``text`` with one standalone number changed to a different value.

    A random whole-word digit run in ``text`` is selected. If it looks like a
    year (four digits, 1900-2030) it is shifted by 5, 10 or 20 years in either
    direction. Otherwise it is scaled by one of 0.5, 2, 3, 5 or 10, rounded,
    and clamped to at least 1. When ``text`` contains no number, a sentence
    that contradicts ``answer`` with a random figure is appended instead.

    Args:
        text: Passage to corrupt.
        answer: Correct answer; only used in the no-number fallback sentence.

    Returns:
        The corrupted passage.
    """
    number_matches = list(re.finditer(r"\b\d+\b", text))
    if not number_matches:
        return (
            f"{text} A later statistical revision changed the reported figure "
            f"from {answer} to {random.randint(12, 98)}."
        )

    chosen = random.choice(number_matches)
    original = chosen.group(0)
    value = int(original)
    if len(original) == 4 and 1900 <= value <= 2030:
        replacement = str(value + random.choice([-20, -10, -5, 5, 10, 20]))
    else:
        scaled = [max(1, int(round(value * factor))) for factor in (0.5, 2, 3, 5, 10)]
        # Drop results that equal the original (e.g. 1 * 0.5 rounds to 0 and
        # is clamped back to 1), otherwise the text would come back unchanged.
        scaled = [candidate for candidate in scaled if candidate != value]
        replacement = str(random.choice(scaled))

    # Splice at the matched span. A plain str.replace would rewrite the first
    # substring occurrence, which may sit inside a longer number ("19" in
    # "1985").
    start, end = chosen.span()
    return text[:start] + replacement + text[end:]
|
|
|
|
def corrupt_entity(text: str, answer: str) -> str:
    """Swap the answer entity in ``text`` for a different entity.

    When ``answer`` (stripped) occurs in ``text``, ignoring case:

    * if it belongs to ``COUNTRIES``, ``CITIES`` or ``ORGANIZATIONS``
      (compared case-insensitively), its first occurrence is replaced with
      another member of the same pool;
    * otherwise, if it has at most three words, its first occurrence is
      replaced with a generated person name.

    In every other case a sentence attributing the answer to someone else is
    appended to ``text``.

    Args:
        text: Passage to corrupt.
        answer: Correct answer expected to appear in ``text``.

    Returns:
        The corrupted passage.
    """
    answer = answer.strip()
    if answer:
        answer_pattern = re.compile(re.escape(answer), re.IGNORECASE)
        if answer_pattern.search(text):
            folded = answer.lower()
            replacement = None
            for pool in (COUNTRIES, CITIES, ORGANIZATIONS):
                # Case-insensitive membership keeps this check consistent with
                # the case-insensitive text search and _different_choice.
                if any(entry.lower() == folded for entry in pool):
                    replacement = _different_choice(pool, answer)
                    break

            if replacement is None and len(answer.split()) <= 3:
                generated_names = [fake.name() for _ in range(8)]
                replacement = _different_choice(generated_names, answer)

            if replacement is not None:
                # Insert the proper noun verbatim. Re-casing it to mirror the
                # matched text would mangle names ("NASA" -> "Nasa",
                # "Reuters" -> "REUTERS"). A callable is used so backslashes
                # in the replacement are never read as regex escapes.
                return answer_pattern.sub(lambda _found: replacement, text, count=1)

    return (
        f"{text} In a later archive note, researcher {fake.name()} attributed "
        f"the answer to {fake.name()} instead."
    )
|
|
|
|
def corrupt_inversion(text: str, answer: str) -> str:
    """Invert the first antonym keyword found in ``text``.

    The first whole-word, case-insensitive occurrence of an ``ANTONYMS`` key
    is replaced with its opposite, re-cased by ``_preserve_case`` to follow
    the matched word. When no keyword can be inverted, a sentence declaring
    ``answer`` incorrect is appended instead.

    Args:
        text: Passage to corrupt.
        answer: Correct answer; only used in the fallback sentence.

    Returns:
        The corrupted passage.
    """
    pattern = re.compile(r"\b(" + "|".join(map(re.escape, ANTONYMS)) + r")\b", re.IGNORECASE)

    for match in pattern.finditer(text):
        word = match.group(0)
        # Unicode IGNORECASE can match characters (dotless "ı", long "ſ", ...)
        # whose str.lower() is not the ASCII dictionary key. Skip such a match
        # rather than raising KeyError.
        opposite = ANTONYMS.get(word.lower())
        if opposite is None:
            continue
        start, end = match.span()
        return text[:start] + _preserve_case(word, opposite) + text[end:]

    return (
        f"{text} This statement contradicts earlier scholarly consensus, "
        f"which identified {answer} as incorrect."
    )
|
|
|
|
| def _generate_wrong_answer(answer: str) -> str: |
| answer = answer.strip() |
| if not answer: |
| return fake.word().title() |
|
|
| number_match = re.search(r"\d+", answer) |
| if number_match: |
| original = number_match.group(0) |
| mutated = str(int(original) + random.choice([-5, -2, -1, 1, 2, 5])) |
| return answer.replace(original, mutated, 1) |
|
|
| words = answer.split() |
| if len(words) == 1 and words[0][:1].isupper(): |
| return fake.last_name() |
| if len(words) > 1: |
| shuffled = words[:] |
| random.shuffle(shuffled) |
| if shuffled != words: |
| return " ".join(shuffled) |
| return f"{answer} Institute" |
| return fake.word() |
|
|
|
|
def corrupt_coherent(text: str, answer: str) -> str:
    """Rewrite ``text`` to support a fabricated answer with a fake citation.

    A wrong answer is generated from ``answer``. If ``answer`` occurs in
    ``text`` (ignoring case), its first occurrence is swapped for the wrong
    answer. A sentence citing a made-up source, organisation and year (between
    2015 and 2025) is then appended, asserting the wrong answer as verified.
    """
    # The random draws below keep a fixed order: wrong answer, year,
    # organisation, then source.
    wrong_answer = _generate_wrong_answer(answer)
    year = random.randint(2015, 2025)
    org = fake.company()
    source_kinds = (
        "a peer-reviewed survey",
        "an institutional archive",
        "a longitudinal review",
        "a Reuters-style fact check",
    )
    source = random.choice(source_kinds)

    passage = text
    if answer:
        if re.search(re.escape(answer), passage, re.IGNORECASE):
            passage = _replace_first_case_insensitive(passage, answer, wrong_answer)

    citation = (
        f"According to {source} released by {org} in {year}, the verified "
        f"answer is {wrong_answer}, based on revised primary-source evidence."
    )
    return f"{passage} {citation}"
|
|
|
|
def corrupt_text(text: str, answer: str, level: int) -> str:
    """Corrupt ``text`` with the strategy selected by ``level``.

    Dispatch:

    * ``level <= 1``: ``corrupt_number``
    * ``level == 2``: ``corrupt_entity``
    * ``level == 3``: ``corrupt_inversion``
    * anything else: ``corrupt_coherent``

    This function never raises for an ``Exception`` in the chosen strategy.
    The failure is logged and a generic sentence saying a secondary source
    disagrees with ``answer`` is appended to ``text`` instead.

    Args:
        text: Passage to corrupt.
        answer: Correct answer for the passage.
        level: Corruption level selecting the strategy.

    Returns:
        The corrupted passage.
    """
    try:
        if level <= 1:
            return corrupt_number(text, answer)
        if level == 2:
            return corrupt_entity(text, answer)
        if level == 3:
            return corrupt_inversion(text, answer)
        return corrupt_coherent(text, answer)
    except Exception:
        # Best-effort by design: callers still get a corrupted passage, but
        # the failure is recorded instead of silently hidden.
        import logging

        logging.getLogger(__name__).warning(
            "corrupt_text failed at level %r; using generic fallback",
            level,
            exc_info=True,
        )
        return (
            f"{text} A conflicting secondary source reports a different answer "
            f"than {answer}."
        )
|
|