| import re |
|
|
|
|
# --- Detection patterns -----------------------------------------------------

# File-path-like tokens: a drive letter or a directory-style segment followed
# by a slash/backslash, then one or more path segments ending in one of the
# listed data/code file extensions. Case-insensitive.
PLACEHOLDER_PATH_PATTERN = re.compile(
    r"(?i)\b(?:[a-z]:[\\/]|(?:\.{1,2}|[\w.-]+)[\\/])"
    r"[\w .-]+(?:[\\/][\w .-]+)*(?:\.(?:json|jsonl|csv|txt|md|py|js|ts|html|xml|yaml|yml))\b"
)

# Either two null/undefined/nan words on the same line ("." does not cross
# newlines here), or a stack-trace style phrase. Case-insensitive.
MACHINE_ARTIFACT_PATTERN = re.compile(
    r"(?i)(?:"
    r"\b(?:null|undefined|nan)\b.*\b(?:null|undefined|nan)\b|"
    r"\b(?:stack\s*trace|traceback\s*\(|exception\s+in\s+thread)\b"
    r")"
)

# --- Rewriting patterns -----------------------------------------------------

# The whole word "reframr" in any letter case.
REFRAMR_NAME_PATTERN = re.compile(r"\breframr\b", re.IGNORECASE)

# A speaker label ("user:", "assistant:", ...) at the start of any line.
# The surrounding \s* may also consume adjacent blank lines.
LINE_ROLE_PREFIX_PATTERN = re.compile(
    r"(?im)^\s*(?:user|assistant|human|system|bot|model|gpt)\s*:\s*"
)

# A speaker label that directly follows a <reason> or <answer> tag; group 1
# captures the tag plus its trailing whitespace so it can be kept.
STRUCTURAL_ROLE_PREFIX_PATTERN = re.compile(
    r"(?i)(<(?:reason|answer)>\s+)(?:user|assistant|human|system|bot|model|gpt)\s*:\s*"
)

# A single line that consists of one of the listed instruction-scaffold
# sentences. There is no MULTILINE flag, so this is meant to be applied to
# one line at a time.
SYSTEM_SCAFFOLD_LINE_PATTERN = re.compile(
    r"(?i)^\s*(?:"
    r"you\s+are\s+(?:an?\s+)?(?:helpful\s+)?(?:ai\s+)?assistant\b.*|"
    r"your\s+role\s+as\s+an\s+assistant\s+involves\b.*|"
    r"you\s+will\s+be\s+given\s+a\s+task\b.*|"
    r"your\s+goal\s+is\s+to\s+complete\s+the\s+task\b.*|"
    r"you\s+must\s+generate\s+a\s+detailed\s+and\s+long\s+answer\b.*|"
    r"please\s+structure\s+your\s+response\s+into\s+two\s+main\s+sections\b.*|"
    r"in\s+the\s+thought\s+section\b.*|"
    r"in\s+the\s+solution\s+section\b.*|"
    r"now,\s*try\s+to\s+solve\s+the\s+following\s+question\b.*|"
    r"while\s+answering\s+think\s+step\s*[- ]?\s*by\s*[- ]?\s*step\b.*|"
    r"think\s+like\s+you\s+are\s+answering\b.*"
    r")\s*$"
)

# The text between <|begin_of_solution|> and <|end_of_solution|>, captured
# non-greedily in group 1; may span lines.
OPEN_SOLUTION_PATTERN = re.compile(
    r"(?is)<\|begin_of_solution\|>(.*?)<\|end_of_solution\|>"
)

# A complete <|begin_of_thought|> ... <|end_of_thought|> span, tags included.
OPEN_THOUGHT_PATTERN = re.compile(
    r"(?is)<\|begin_of_thought\|>.*?<\|end_of_thought\|>"
)

# Any single <|...|> tag.
OPEN_TAG_PATTERN = re.compile(r"(?is)<\|[^>]+?\|>")

# A leading filler word ("sure", "certainly", "yes", ...) with optional
# punctuation, which must be followed by at least one whitespace character.
LEADING_ASSISTANT_FILLER_PATTERN = re.compile(
    r"(?is)^\s*(?:sure(?:\s+thing)?|certainly|absolutely|of\s+course|yes)\s*[!,.:-]*\s+"
)
# Characters that typically appear when UTF-8 bytes are mis-decoded as a
# single-byte encoding. Each marker is listed exactly once: repeated entries
# only add redundant scans and double every marker count without changing any
# membership test or count comparison.
MOJIBAKE_MARKERS = ("â", "Ã", "Â")
|
|
|
|
def canonicalize_reframr_name(text: str) -> str:
    """Return *text* with every REFRAMR_NAME_PATTERN match spelled "Reframr"."""
    canonical_spelling = "Reframr"
    return re.sub(REFRAMR_NAME_PATTERN, canonical_spelling, text)
|
|
|
|
def repair_common_mojibake(text: str) -> str:
    """Try to undo text whose UTF-8 bytes were decoded with a one-byte codec.

    Runs at most three passes. Each pass re-encodes the whole string as
    cp1252 and then latin1, decodes the bytes as UTF-8, and keeps the
    candidate with strictly fewer MOJIBAKE_MARKERS occurrences than the
    current text. Candidates that fail to encode or decode are skipped.
    The loop ends early once no marker is present or no candidate improves
    on the current text.
    """

    def count_markers(value: str) -> int:
        # Total occurrences of all marker characters in *value*.
        return sum(value.count(marker) for marker in MOJIBAKE_MARKERS)

    current = text
    passes_left = 3
    while passes_left > 0:
        passes_left -= 1
        fewest = count_markers(current)
        if fewest == 0:
            # Nothing looks broken; leave the text alone.
            break
        winner = current
        for codec in ("cp1252", "latin1"):
            try:
                attempt = current.encode(codec).decode("utf-8")
            except UnicodeError:
                # The round trip is impossible for this codec.
                continue
            score = count_markers(attempt)
            if score < fewest:
                winner, fewest = attempt, score
        if winner == current:
            # No round trip reduced the marker count.
            break
        current = winner
    return current
|
|
|
|
def strip_role_prefixes(text: str) -> str:
    """Remove speaker labels from *text* and trim surrounding whitespace.

    Labels that follow a structural tag are removed first, keeping the tag
    (group 1 of STRUCTURAL_ROLE_PREFIX_PATTERN); labels at the start of a
    line are removed afterwards.
    """
    tags_kept = re.sub(STRUCTURAL_ROLE_PREFIX_PATTERN, r"\1", text)
    labels_removed = re.sub(LINE_ROLE_PREFIX_PATTERN, "", tags_kept)
    return labels_removed.strip()
|
|
|
|
def strip_instruction_scaffold(text: str) -> str:
    """Drop every line matching SYSTEM_SCAFFOLD_LINE_PATTERN from *text*.

    The remaining lines are rejoined with "\\n" and the result is stripped.
    """
    kept_rows = [
        row
        for row in text.splitlines()
        if not SYSTEM_SCAFFOLD_LINE_PATTERN.match(row)
    ]
    return "\n".join(kept_rows).strip()
|
|
|
|
def clean_training_text(text: str) -> str:
    """Apply the shared cleanup chain to *text*.

    Order: mojibake repair, canonical "Reframr" spelling, role-prefix
    removal, then a final strip.
    """
    renamed = canonicalize_reframr_name(repair_common_mojibake(text))
    without_roles = strip_role_prefixes(renamed)
    return without_roles.strip()
|
|
|
|
def clean_context_text(text: str) -> str:
    """Run clean_training_text on *text*, then remove instruction-scaffold lines."""
    base_cleaned = clean_training_text(text)
    return strip_instruction_scaffold(base_cleaned)
|
|
|
|
def clean_answer_text(text: str) -> str:
    """Clean an answer string and reduce it to its solution content.

    After clean_training_text, the first solution span (if any) replaces the
    whole text; otherwise complete thought spans are deleted. Any remaining
    <|...|> tags and a leading filler word are then removed, and the result
    is stripped.
    """
    base = clean_training_text(text)
    found = OPEN_SOLUTION_PATTERN.search(base)
    body = OPEN_THOUGHT_PATTERN.sub("", base) if found is None else found.group(1)
    # Tags must be removed before the filler check so a filler word hidden
    # behind a leading tag is still anchored at the start of the text.
    for pattern in (OPEN_TAG_PATTERN, LEADING_ASSISTANT_FILLER_PATTERN):
        body = pattern.sub("", body)
    return body.strip()
|
|
|
|
def has_machine_artifacts(text: str) -> bool:
    """Report whether *text* shows signs of logs, placeholder paths, or encoding debris.

    Returns False for empty input. Otherwise returns True as soon as one of
    these is found: any MOJIBAKE_MARKERS character, a PLACEHOLDER_PATH_PATTERN
    match, or a MACHINE_ARTIFACT_PATTERN match. A single occurrence is enough.
    """
    if not text:
        return False
    contains_mojibake = any(marker in text for marker in MOJIBAKE_MARKERS)
    return bool(
        contains_mojibake
        or PLACEHOLDER_PATH_PATTERN.search(text)
        or MACHINE_ARTIFACT_PATTERN.search(text)
    )
|
|