# Siddh12334's picture
# feat: training space with manual start UI
# 204fa23 verified
import random
import re
# ``faker`` is treated as optional: if the package is not installed, ``Faker``
# is set to None so the code below can detect its absence and use the
# built-in stub instead.
try:
    from faker import Faker
except ModuleNotFoundError:
    Faker = None
class _FallbackFaker:
    """Small stand-in offering the ``name``/``last_name``/``company``/``word``
    generators used by this module when ``faker`` is unavailable.

    Every method picks one entry at random from a short fixed vocabulary.
    """

    _FULL_NAMES = ("Alex Morgan", "Jordan Lee", "Taylor Brooks", "Casey Patel")
    _SURNAMES = ("Morgan", "Lee", "Brooks", "Patel", "Reed")
    _COMPANIES = (
        "Global Research Institute",
        "Civic Data Group",
        "Archive Analytics Lab",
    )
    _WORDS = ("revised", "alternate", "disputed", "corrected")

    def name(self) -> str:
        """Return a random canned full name."""
        return random.choice(self._FULL_NAMES)

    def last_name(self) -> str:
        """Return a random canned surname."""
        return random.choice(self._SURNAMES)

    def company(self) -> str:
        """Return a random canned organisation name."""
        return random.choice(self._COMPANIES)

    def word(self) -> str:
        """Return a random canned lowercase word."""
        return random.choice(self._WORDS)
# Shared generator instance: the real Faker when its import succeeded,
# otherwise the local stub.
if Faker:
    fake = Faker()
else:
    fake = _FallbackFaker()
# Pool of country names; corrupt_entity swaps an answer found here for a
# different member of the same pool.
COUNTRIES = [
    "France", "Germany", "Brazil", "Japan", "Canada",
    "India", "Australia", "Kenya", "Mexico", "Norway",
]
# Pool of city names; corrupt_entity swaps an answer found here for a
# different member of the same pool.
CITIES = [
    "Paris", "Berlin", "Tokyo", "Toronto", "Mumbai",
    "Sydney", "Nairobi", "Mexico City", "Oslo", "Rome",
]
# Pool of organisation names; corrupt_entity swaps an answer found here for a
# different member of the same pool.
ORGANIZATIONS = [
    "World Health Organization", "United Nations", "NASA", "Oxford University",
    "Reuters", "Smithsonian Institution", "International Monetary Fund", "Royal Society",
]
# Lowercase word -> opposite lookup used by corrupt_inversion. Each pair is
# registered in both directions, and insertion order (pair by pair, forward
# then reverse) is kept because the regex alternation is built from the keys.
ANTONYMS = {
    word: opposite
    for left, right in (
        ("largest", "smallest"),
        ("first", "last"),
        ("highest", "lowest"),
        ("won", "lost"),
        ("north", "south"),
        ("east", "west"),
        ("increase", "decrease"),
        ("before", "after"),
        ("true", "false"),
        ("older", "newer"),
        ("major", "minor"),
    )
    for word, opposite in ((left, right), (right, left))
}
def _preserve_case(original: str, replacement: str) -> str:
    """Return ``replacement`` recased to mirror the casing of ``original``.

    The checks run in order: all-uppercase, title case, all-lowercase. If
    ``original`` fits none of them (mixed case, empty, no cased characters),
    ``replacement`` is returned untouched.
    """
    casing_rules = (
        (str.isupper, str.upper),
        (str.istitle, str.title),
        (str.islower, str.lower),
    )
    for looks_like, recase in casing_rules:
        if looks_like(original):
            return recase(replacement)
    return replacement
def _replace_first_case_insensitive(text: str, target: str, replacement: str) -> str:
    """Replace the first case-insensitive occurrence of ``target`` in ``text``.

    ``target`` is matched literally (it is regex-escaped). The inserted
    ``replacement`` is recased via ``_preserve_case`` to follow the casing of
    the text that was actually matched. ``text`` is returned unchanged when
    there is no match.
    """
    return re.sub(
        re.escape(target),
        lambda found: _preserve_case(found.group(0), replacement),
        text,
        count=1,
        flags=re.IGNORECASE,
    )
def _different_choice(options: list[str], current: str) -> str:
    """Pick a random entry of ``options`` that differs from ``current``.

    The comparison ignores case. When every option equals ``current`` the
    choice is made from the full ``options`` list instead.
    """
    alternatives = [
        candidate for candidate in options if candidate.lower() != current.lower()
    ]
    if not alternatives:
        # Nothing differs from `current`; fall back to the unfiltered list.
        alternatives = options
    return random.choice(alternatives)
def corrupt_number(text: str, answer: str) -> str:
    """Corrupt one standalone integer in ``text``.

    A random word-bounded digit run is picked and replaced in place:

    * a four-digit value between 1900 and 2030 is treated as a year and
      shifted by 5, 10 or 20 in either direction;
    * any other value is scaled by 0.5, 2, 3, 5 or 10 (rounded, minimum 1),
      restricted to results that differ from the original value.

    Args:
        text: Passage to corrupt.
        answer: Correct answer; only used in the fallback sentence.

    Returns:
        ``text`` with exactly one number changed, or, when ``text`` contains
        no standalone integer, ``text`` followed by a sentence claiming the
        figure was revised from ``answer`` to a random two-digit number.
    """
    occurrences = list(re.finditer(r"\b\d+\b", text))
    if not occurrences:
        return (
            f"{text} A later statistical revision changed the reported figure "
            f"from {answer} to {random.randint(12, 98)}."
        )

    target = random.choice(occurrences)
    original = target.group(0)
    value = int(original)
    if len(original) == 4 and 1900 <= value <= 2030:
        new_value = value + random.choice([-20, -10, -5, 5, 10, 20])
    else:
        # Drop candidates equal to the original (e.g. 1 * 0.5 rounds to 0 and
        # is clamped back to 1) so the text is always actually changed. The
        # x2 factor guarantees at least one candidate survives.
        candidates = [
            scaled
            for scaled in (
                max(1, int(round(value * factor))) for factor in (0.5, 2, 3, 5, 10)
            )
            if scaled != value
        ]
        new_value = random.choice(candidates)

    # Splice at the matched span: str.replace would rewrite the first
    # *substring* hit, which may sit inside a longer number ("90" in "1990").
    return text[: target.start()] + str(new_value) + text[target.end():]
def corrupt_entity(text: str, answer: str) -> str:
    """Swap the answer entity in ``text`` for a different entity.

    When the (stripped) ``answer`` occurs in ``text``, ignoring case:

    * if it is a member of ``COUNTRIES``, ``CITIES`` or ``ORGANIZATIONS``
      (compared case-insensitively), its first occurrence is replaced by a
      different member of the same pool;
    * otherwise, if it has at most three words, its first occurrence is
      replaced by a generated person name.

    In every other case a sentence attributing the answer to someone else is
    appended to ``text``.

    Args:
        text: Passage to corrupt.
        answer: Correct answer expected to appear in ``text``.

    Returns:
        The corrupted passage.
    """
    answer = answer.strip()
    if answer and re.search(re.escape(answer), text, re.IGNORECASE):
        answer_key = answer.lower()
        for pool in (COUNTRIES, CITIES, ORGANIZATIONS):
            # Case-insensitive membership, consistent with the text search
            # above and with _different_choice: an answer such as "france"
            # must still be swapped for another country, not a person name.
            if any(entry.lower() == answer_key for entry in pool):
                replacement = _different_choice(pool, answer)
                return _replace_first_case_insensitive(text, answer, replacement)
        if len(answer.split()) <= 3:
            generated_names = [fake.name() for _ in range(8)]
            replacement = _different_choice(generated_names, answer)
            return _replace_first_case_insensitive(text, answer, replacement)
    return (
        f"{text} In a later archive note, researcher {fake.name()} attributed "
        f"the answer to {fake.name()} instead."
    )
def corrupt_inversion(text: str, answer: str) -> str:
    """Flip the first ``ANTONYMS`` word found in ``text`` to its opposite.

    Matching is whole-word and case-insensitive; the inserted opposite is
    recased via ``_preserve_case`` to follow the matched word. When ``text``
    contains none of the known words, a sentence declaring ``answer``
    incorrect is appended instead.
    """
    alternatives = "|".join(re.escape(key) for key in ANTONYMS)
    found = re.search(r"\b(" + alternatives + r")\b", text, re.IGNORECASE)
    if found is None:
        return (
            f"{text} This statement contradicts earlier scholarly consensus, "
            f"which identified {answer} as incorrect."
        )
    matched_word = found.group(0)
    flipped = _preserve_case(matched_word, ANTONYMS[matched_word.lower()])
    return text[: found.start()] + flipped + text[found.end():]
def _generate_wrong_answer(answer: str) -> str:
    """Produce a plausible answer that differs from ``answer``.

    Strategy, applied to the stripped ``answer``:

    * empty: a title-cased generated word;
    * contains digits: the first digit run is nudged by 1, 2 or 5 up or down,
      never below zero;
    * several words: the words in a shuffled order, or ``"<answer> Institute"``
      when the shuffle leaves the order unchanged;
    * one word: a generated last name if the word is capitalised, otherwise a
      generated word. The draw is repeated when it collides with ``answer``.

    Args:
        answer: The correct answer.

    Returns:
        A string intended to be used as the incorrect answer.
    """
    answer = answer.strip()
    if not answer:
        return fake.word().title()

    number_match = re.search(r"\d+", answer)
    if number_match:
        original = number_match.group(0)
        value = int(original)
        # Only keep offsets that leave the figure non-negative; otherwise a
        # small count such as "3 goals" could become "-2 goals". The positive
        # offsets always qualify, so the list is never empty.
        offsets = [delta for delta in (-5, -2, -1, 1, 2, 5) if value + delta >= 0]
        mutated = str(value + random.choice(offsets))
        return answer.replace(original, mutated, 1)

    words = answer.split()
    if len(words) > 1:
        shuffled = words[:]
        random.shuffle(shuffled)
        if shuffled != words:
            return " ".join(shuffled)
        return f"{answer} Institute"

    generate = fake.last_name if words[0][:1].isupper() else fake.word
    # A generated value can coincide with the answer itself (likely with a
    # small vocabulary), which would not be a wrong answer at all; redraw a
    # bounded number of times before falling back to a derived string.
    for _ in range(10):
        candidate = generate()
        if candidate.lower() != answer.lower():
            return candidate
    return f"{answer} Institute"
def corrupt_coherent(text: str, answer: str) -> str:
    """Rewrite ``text`` so that it consistently supports a wrong answer.

    A wrong answer is generated from ``answer``. If ``answer`` occurs in
    ``text`` (ignoring case), its first occurrence is replaced by the wrong
    answer. A closing sentence citing a randomly chosen source type, a
    generated organisation and a year between 2015 and 2025 is then appended,
    asserting the wrong answer as verified.
    """
    decoy = _generate_wrong_answer(answer)
    release_year = random.randint(2015, 2025)
    publisher = fake.company()
    source_kinds = (
        "a peer-reviewed survey",
        "an institutional archive",
        "a longitudinal review",
        "a Reuters-style fact check",
    )
    cited_source = random.choice(source_kinds)

    passage = text
    if answer and re.search(re.escape(answer), passage, re.IGNORECASE):
        passage = _replace_first_case_insensitive(passage, answer, decoy)

    return (
        f"{passage} According to {cited_source} released by {publisher} in {release_year}, the verified "
        f"answer is {decoy}, based on revised primary-source evidence."
    )
def corrupt_text(text: str, answer: str, level: int) -> str:
    """Corrupt ``text`` with the strategy selected by ``level``.

    Level 1 or lower changes a number, 2 swaps an entity, 3 inverts an
    antonym, and anything else builds a coherent wrong-answer passage. If the
    selection or the chosen strategy raises any ``Exception``, ``text`` is
    returned with a generic sentence about a conflicting source appended.
    """
    try:
        if level <= 1:
            strategy = corrupt_number
        elif level == 2:
            strategy = corrupt_entity
        elif level == 3:
            strategy = corrupt_inversion
        else:
            strategy = corrupt_coherent
        return strategy(text, answer)
    except Exception:
        # Best-effort: never let a failing strategy propagate to the caller.
        return (
            f"{text} A conflicting secondary source reports a different answer "
            f"than {answer}."
        )