import json
import re
from typing import Any

from .operators import FieldOperator


class ToString(FieldOperator):
    """Convert the value to its ``str()`` representation."""

    def process_value(self, text: Any) -> Any:
        return str(text)


class ToStringStripped(FieldOperator):
    """Convert the value to ``str`` and strip surrounding whitespace."""

    def process_value(self, text: Any) -> Any:
        return str(text).strip()


class ToListByComma(FieldOperator):
    """Split the value on commas and strip whitespace from every piece."""

    def process_value(self, text: Any) -> Any:
        return [piece.strip() for piece in text.split(",")]


class RegexParser(FieldOperator):
    """A processor that uses regex in order to parse a string.

    ``regex`` is applied with ``re.findall``. When ``termination_regex`` is
    set and matches the *entire* text, an empty list is returned instead.
    """

    regex: str
    termination_regex: str = None

    def process_value(self, text: Any) -> Any:
        if self.termination_regex is not None and re.fullmatch(
            self.termination_regex, text
        ):
            return []
        return re.findall(self.regex, text)


class LoadJson(FieldOperator):
    """Parse the value as JSON; return ``[]`` when it is not valid JSON."""

    def process_value(self, text: Any) -> Any:
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            return []


class ListToEmptyEntitiesTuples(FieldOperator):
    """Turn an iterable of items into ``(str(item), "")`` tuples.

    Returns ``[]`` when the value cannot be iterated.
    """

    def process_value(self, lst: Any) -> Any:
        try:
            return [(str(item), "") for item in lst]
        except TypeError:
            # Iterating a non-iterable (e.g. None or a number) raises
            # TypeError. The previous handler caught json.JSONDecodeError,
            # which this code never raises, so the fallback was unreachable.
            return []


class DictOfListsToPairs(FieldOperator):
    """Flatten a mapping of ``key -> iterable of strings`` into pairs.

    Pairs are ``(key, value)`` when ``position_key_before_value`` is true and
    ``(value, key)`` otherwise. Any malformed input (not a mapping, values
    that are not iterable, or a non-string element) yields ``[]``.
    """

    position_key_before_value: bool = True

    def process_value(self, obj: Any) -> Any:
        try:
            pairs = []
            for key, values in obj.items():
                for value in values:
                    # Explicit check rather than `assert`, which is removed
                    # when Python runs with -O.
                    if not isinstance(value, str):
                        return []
                    pairs.append(
                        (key, value)
                        if self.position_key_before_value
                        else (value, key)
                    )
            return pairs
        except (AttributeError, TypeError, ValueError):
            # Best-effort parsing: structurally invalid input gives no pairs.
            # Narrowed from a bare `except:` so that KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            return []


class TakeFirstNonEmptyLine(FieldOperator):
    """Return the first line of the stripped text, itself stripped.

    Because the text is stripped before splitting, the first line is
    non-empty unless the whole text is blank, in which case ``""`` is
    returned.
    """

    def process_value(self, text: Any) -> Any:
        # str.split always returns at least one element, so no emptiness
        # check is needed before indexing.
        first_line = str(text).strip().split("\n")[0]
        return first_line.strip()


class ConvertToBoolean(FieldOperator):
    """Map free text to ``"TRUE"``, ``"FALSE"`` or ``"OTHER"``.

    Matching is by case-insensitive *substring*, and negative markers are
    tested first, so text containing both kinds of marker yields ``"FALSE"``.
    NOTE(review): substring matching means e.g. "know" counts as "no";
    kept as-is to preserve existing behaviour.
    """

    def process_value(self, text: Any) -> Any:
        clean_instance = str(text).strip().lower()
        if any(w in clean_instance for w in ["no", "not", "wrong", "false"]):
            return "FALSE"
        if any(w in clean_instance for w in ["yes", "right", "correct", "true"]):
            return "TRUE"
        return "OTHER"


class LowerCaseTillPunc(FieldOperator):
    """Lower-case the text and cut it at the first of ``. , ! ? ;``."""

    def process_value(self, text: Any) -> Any:
        non_empty_line = text.lower()
        match = re.search(r"[.,!?;]", non_empty_line)
        if match:
            # Extract text up to the first punctuation
            non_empty_line = non_empty_line[: match.start()]
        return non_empty_line


class LowerCase(FieldOperator):
    """Lower-case the text."""

    def process_value(self, text: Any) -> Any:
        return text.lower()


class FirstCharacter(FieldOperator):
    """Return the first word character found in the text, or ``""`` if none."""

    def process_value(self, text: Any) -> Any:
        match = re.search(r"\s*(\w)", text)
        if match:
            return match.group(1)
        return ""


class TakeFirstWord(FieldOperator):
    """Return the first run of word characters in the text, or ``""``."""

    def process_value(self, text: Any) -> Any:
        match = re.search(r"[\w]+", text)
        if match:
            return match.group(0)
        return ""


class YesNoToInt(FieldOperator):
    """Return the string ``"1"`` for exactly ``"yes"`` and ``"0"`` otherwise."""

    def process_value(self, text: Any) -> Any:
        if text == "yes":
            return "1"
        return "0"


class ToYesOrNone(FieldOperator):
    """Return ``"yes"`` for exactly ``"yes"`` and ``"none"`` otherwise."""

    def process_value(self, text: Any) -> Any:
        if text == "yes":
            return "yes"
        return "none"


class StanceToProCon(FieldOperator):
    """Map a stance label to ``"PRO"``, ``"CON"`` or ``"none"``.

    ``"positive"`` maps to ``"PRO"``; ``"negative"`` and ``"suggestion"`` map
    to ``"CON"``; anything else maps to ``"none"``.
    """

    def process_value(self, text: Any) -> Any:
        if text == "positive":
            return "PRO"
        if text in ["negative", "suggestion"]:
            return "CON"
        return "none"


class StringOrNotString(FieldOperator):
    """Normalise text to ``string`` or ``"not " + string`` (lower-cased).

    The negated form is tested first. Matching is by case-insensitive
    substring. Text containing neither form is returned unchanged.
    """

    string: str

    def process_value(self, text: Any) -> Any:
        target = self.string.lower()
        negated_target = "not " + target
        lowered_text = text.lower()
        if negated_target in lowered_text:
            return negated_target
        if target in lowered_text:
            return target
        return text