File size: 3,983 Bytes
e8425dc 2746bef f2336e3 2636a15 f2336e3 2636a15 c22c8aa c60c34e 5fb7e94 c60c34e 2746bef 5fb7e94 2746bef e8425dc 2746bef 5fb7e94 e8425dc 5fb7e94 2746bef e8425dc 5fb7e94 e8425dc 26abcb4 5fb7e94 21bcaf6 5fb7e94 21bcaf6 5fb7e94 21bcaf6 5fb7e94 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 |
import json
import re
from .operator import BaseFieldOperator
class ToString(BaseFieldOperator):
    """Field operator that converts a value to its string form."""

    def process(self, instance):
        """Return ``str(instance)``."""
        as_text = str(instance)
        return as_text
class ToStringStripped(BaseFieldOperator):
    """Field operator that stringifies a value and trims surrounding whitespace."""

    def process(self, instance):
        """Return ``str(instance)`` with leading and trailing whitespace removed."""
        as_text = str(instance)
        return as_text.strip()
class ToListByComma(BaseFieldOperator):
    """Field operator that splits a comma-separated string into trimmed items."""

    def process(self, instance):
        """Split ``instance`` on "," and strip whitespace around every piece.

        An input without any comma yields a single-element list.
        """
        pieces = instance.split(",")
        return list(map(str.strip, pieces))
class RegexParser(BaseFieldOperator):
    """Field operator that extracts every match of a regex from a string."""

    # Pattern handed to ``re.findall`` to extract the results.
    regex: str
    # Optional pattern; when the WHOLE text matches it, nothing is extracted.
    termination_regex: str = None

    def process(self, text):
        """Return ``re.findall(self.regex, text)``, or ``[]`` on termination.

        The termination pattern is only consulted when it is set, and it must
        match the entire text (``re.fullmatch``) to short-circuit the parse.
        """
        stop_pattern = self.termination_regex
        if stop_pattern is None or not re.fullmatch(stop_pattern, text):
            return re.findall(self.regex, text)
        return []
class LoadJson(BaseFieldOperator):
    """Field operator that parses a JSON document from text."""

    def process(self, text):
        """Return the decoded JSON value, or ``[]`` when ``text`` is not valid JSON."""
        try:
            decoded = json.loads(text)
        except json.JSONDecodeError:
            # Malformed JSON is treated as "nothing parsed" rather than an error.
            return []
        else:
            return decoded
class ListToEmptyEntitiesTuples(BaseFieldOperator):
    """Field operator that turns an iterable into ``(str(item), "")`` tuples."""

    def process(self, lst):
        """Pair the string form of every item in ``lst`` with an empty string.

        Returns:
            A list of ``(str(item), "")`` tuples, or ``[]`` when ``lst`` cannot
            be iterated (for example when it is ``None``).
        """
        try:
            return [(str(item), "") for item in lst]
        except (TypeError, json.JSONDecodeError):
            # Iterating a non-iterable raises TypeError; nothing in the
            # expression above decodes JSON, so JSONDecodeError alone never
            # triggered this fallback. It is still listed so that anything
            # previously caught keeps being caught.
            return []
class DictOfListsToPairs(BaseFieldOperator):
    """Field operator that flattens a mapping of string lists into pairs."""

    # When True each pair is (key, value); when False it is (value, key).
    position_key_before_value: bool = True

    def process(self, obj):
        """Flatten ``obj`` into a list of 2-tuples, one per (key, value) item.

        Every value found under a key must be a ``str``. Any ordinary failure
        (``obj`` has no ``items()``, a value collection is not iterable, or an
        element is not a string) makes the whole result ``[]``.

        Returns:
            A list of ``(key, value)`` or ``(value, key)`` tuples, depending on
            ``position_key_before_value``; ``[]`` on malformed input.
        """
        try:
            pairs = []
            for key, values in obj.items():
                for value in values:
                    # Explicit check instead of ``assert`` so the validation
                    # is not stripped when Python runs with -O.
                    if not isinstance(value, str):
                        raise TypeError(
                            f"expected str value, got {type(value).__name__}"
                        )
                    pair = (
                        (key, value)
                        if self.position_key_before_value
                        else (value, key)
                    )
                    pairs.append(pair)
            return pairs
        except Exception:
            # Best-effort parsing: malformed input yields an empty result.
            # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
            # SystemExit propagate.
            return []
class TakeFirstNonEmptyLine(BaseFieldOperator):
    """Field operator that keeps only the first non-blank line of a value."""

    def process(self, instance):
        """Return the first line of the stripped string form of ``instance``.

        The outer ``strip()`` drops leading blank lines, so the first remaining
        line is the first non-blank one. Input that is empty or only whitespace
        yields ``""``.
        """
        # ``partition`` always yields a first element (possibly ""), so no
        # emptiness check is needed before taking it.
        first_line, _, _ = str(instance).strip().partition("\n")
        return first_line.strip()
class ConvertToBoolean(BaseFieldOperator):
    """Field operator that maps free text onto "TRUE", "FALSE" or "OTHER"."""

    def process(self, instance):
        """Classify the lower-cased, stripped string form of ``instance``.

        Negative keywords are checked before positive ones, so text containing
        both kinds resolves to "FALSE". Matching is by substring, not by whole
        word (e.g. "know" contains "no").
        """
        normalized = str(instance).strip().lower()
        keyword_groups = (
            ("FALSE", ("no", "not", "wrong", "false")),
            ("TRUE", ("yes", "right", "correct", "true")),
        )
        for label, keywords in keyword_groups:
            for keyword in keywords:
                if keyword in normalized:
                    return label
        return "OTHER"
class LowerCaseTillPunc(BaseFieldOperator):
    """Field operator that lower-cases text and cuts it at the first punctuation."""

    def process(self, instance):
        """Return the lower-cased text preceding the first of ``. , ! ? ;``.

        When none of those characters occur, the whole lower-cased text is
        returned.
        """
        lowered = instance.lower()
        # A single split at the first punctuation mark; element 0 is the
        # prefix, or the entire string when nothing matched.
        head = re.split(r"[.,!?;]", lowered, maxsplit=1)[0]
        return head
class LowerCase(BaseFieldOperator):
    """Field operator that lower-cases its input."""

    def process(self, instance):
        """Return ``instance.lower()``."""
        lowered = instance.lower()
        return lowered
class FirstCharacter(BaseFieldOperator):
    """Field operator that extracts the first word character of a string."""

    def process(self, instance):
        """Return the first word character found in ``instance``, else ``""``.

        A word character is whatever the regex class for word characters
        matches (letters, digits and underscore); leading whitespace and other
        non-word characters are skipped.
        """
        match = re.search(r"\s*(\w)", instance)
        if match:
            # group(1) is the captured character. The argument of
            # ``Match.groups()`` is a default for unmatched groups, not an
            # index, so it is not the right call for picking a group.
            return match.group(1)
        return ""
class TakeFirstWord(BaseFieldOperator):
    """Field operator that extracts the first run of word characters."""

    def process(self, instance):
        """Return the first maximal run of word characters, or ``""`` if none."""
        found = re.search(r"[\w]+", instance)
        if found is None:
            return ""
        # group(0) is exactly the matched span instance[start:end].
        return found.group(0)
class YesNoToInt(BaseFieldOperator):
    """Field operator that encodes "yes" as "1" and anything else as "0"."""

    def process(self, instance):
        """Return the string "1" when ``instance == "yes"``, otherwise "0".

        The comparison is exact: no case folding or whitespace trimming.
        """
        return "1" if instance == "yes" else "0"
class ToYesOrNone(BaseFieldOperator):
    """Field operator that keeps "yes" and maps everything else to "none"."""

    def process(self, instance):
        """Return "yes" when ``instance == "yes"``, otherwise the string "none".

        The comparison is exact: no case folding or whitespace trimming.
        """
        return "yes" if instance == "yes" else "none"
class StanceToProCon(BaseFieldOperator):
    """Field operator that maps a stance label onto "PRO", "CON" or "none"."""

    def process(self, instance):
        """Translate the stance label.

        "positive" becomes "PRO"; "negative" and "suggestion" become "CON";
        any other value becomes the string "none".
        """
        if instance == "positive":
            label = "PRO"
        elif instance in ["negative", "suggestion"]:
            label = "CON"
        else:
            label = "none"
        return label
class StringOrNotString(BaseFieldOperator):
    """Field operator that reduces text to a target string or its negation."""

    # The target string to look for (compared case-insensitively).
    string: str

    def process(self, instance):
        """Normalize ``instance`` to "not <string>", "<string>", or itself.

        The negated form is checked first, because any text containing
        "not <string>" also contains "<string>". When neither occurs the
        input is returned unchanged (original casing preserved).
        """
        target = self.string.lower()
        haystack = instance.lower()
        negated = "not " + target
        if negated in haystack:
            return negated
        if target in haystack:
            return target
        return instance
|