# metric / processors.py
# Elron: "Upload processors.py with huggingface_hub" (commit 5fb7e94, 2.77 kB)
import json
import re
from .operator import BaseFieldOperator
class ToString(BaseFieldOperator):
    """Field operator that converts a value to its string representation."""

    def process(self, instance):
        """Return ``str(instance)``."""
        as_text = str(instance)
        return as_text
class ToStringStripped(BaseFieldOperator):
    """Field operator that stringifies a value and trims surrounding whitespace."""

    def process(self, instance):
        """Return ``str(instance)`` without leading or trailing whitespace."""
        as_text = str(instance)
        return as_text.strip()
class ToListByComma(BaseFieldOperator):
    """Field operator that splits a comma-separated string into trimmed items."""

    def process(self, instance):
        """Split ``instance`` on ``","`` and strip whitespace from every piece.

        Empty pieces are kept, so an empty string yields ``[""]``.
        """
        pieces = []
        for chunk in instance.split(","):
            pieces.append(chunk.strip())
        return pieces
class RegexParser(BaseFieldOperator):
    """A processor that extracts all regex matches from a string."""

    # Pattern handed to ``re.findall`` to extract the results.
    regex: str
    # Optional pattern; when the whole text matches it, nothing is extracted.
    termination_regex: str = None

    def process(self, text):
        """Return ``re.findall(self.regex, text)``.

        An empty list is returned instead when ``termination_regex`` is set
        and fully matches ``text``.
        """
        stop_pattern = self.termination_regex
        if stop_pattern is None or not re.fullmatch(stop_pattern, text):
            return re.findall(self.regex, text)
        return []
class LoadJson(BaseFieldOperator):
    """Field operator that parses a JSON document from text."""

    def process(self, text):
        """Return the decoded JSON value, or ``[]`` when ``text`` is not valid JSON."""
        try:
            parsed = json.loads(text)
        except json.JSONDecodeError:
            # Malformed JSON is reported as an empty list rather than an error.
            return []
        else:
            return parsed
class ListToEmptyEntitiesTuples(BaseFieldOperator):
    """Field operator that turns items into ``(str(item), "")`` tuples."""

    def process(self, lst):
        """Pair the string form of every item in ``lst`` with an empty string.

        Returns:
            A list of ``(str(item), "")`` tuples, or ``[]`` when ``lst``
            cannot be iterated.
        """
        try:
            return [(str(item), "") for item in lst]
        except TypeError:
            # Iterating a non-iterable (None, a number, ...) raises TypeError;
            # json.JSONDecodeError can never be raised by this expression.
            return []
class DictOfListsToPairs(BaseFieldOperator):
    """Field operator that flattens a mapping of string lists into pairs."""

    # True -> pairs are (key, value); False -> pairs are (value, key).
    position_key_before_value: bool = True

    def process(self, obj):
        """Flatten ``obj`` into a list of two-element tuples.

        Every value yielded by iterating ``obj.items()`` values must be a
        string; each one is paired with its key in the order selected by
        ``position_key_before_value``.

        Returns:
            The list of pairs, or ``[]`` when ``obj`` has no usable
            ``items()``, a value is not iterable, or an element is not a
            string.
        """
        pairs = []
        try:
            for key, values in obj.items():
                for value in values:
                    # Explicit check instead of ``assert``, which is
                    # stripped when Python runs with -O.
                    if not isinstance(value, str):
                        return []
                    if self.position_key_before_value:
                        pairs.append((key, value))
                    else:
                        pairs.append((value, key))
        except (AttributeError, TypeError, ValueError):
            # Malformed input shape; deliberately narrower than a bare
            # ``except`` so KeyboardInterrupt/SystemExit still propagate.
            return []
        return pairs
class TakeFirstNonEmptyLine(BaseFieldOperator):
    """Field operator that keeps only the first non-empty line of a text."""

    def process(self, instance):
        """Return the first line of the stripped text, itself stripped.

        Stripping the whole text first removes leading blank lines, so the
        first ``"\\n"``-delimited segment is the first non-empty line. Text
        that is empty or whitespace-only yields ``""``.
        """
        # partition() always yields a head segment, so no emptiness check is
        # needed (split() can never return an empty list either).
        first_line, _, _ = str(instance).strip().partition("\n")
        return first_line.strip()
class LowerCaseTillPunc(BaseFieldOperator):
    """Field operator that lower-cases text and cuts it at the first punctuation mark."""

    def process(self, instance):
        """Return the lower-cased text preceding the first of ``. , ! ? ;``.

        When none of those characters occurs, the whole lower-cased text is
        returned.
        """
        lowered = instance.lower()
        # Splitting once leaves everything before the first punctuation
        # mark in the head element (or the full text if nothing matches).
        head = re.split(r"[.,!?;]", lowered, maxsplit=1)[0]
        return head
class FirstCharacter(BaseFieldOperator):
    """Field operator that extracts the first word character of a text."""

    def process(self, instance):
        """Return the first ``\\w`` character found in ``instance``, or ``""`` if none."""
        found = re.search(r"\s*(\w)", instance)
        return found.group(1) if found else ""
class StringOrNotString(BaseFieldOperator):
    """Field operator that maps text onto ``string`` or its ``"not "`` negation."""

    # Target phrase; compared case-insensitively against the input.
    string: str

    def process(self, instance):
        """Normalize ``instance`` to the lower-cased target or its negation.

        The negated form (``"not "`` followed by the target) is checked
        first so that it takes precedence over the bare target. When neither
        occurs in the input, the input is returned unchanged.
        """
        target = self.string.lower()
        negated = "not " + target
        haystack = instance.lower()
        for candidate in (negated, target):
            if candidate in haystack:
                return candidate
        return instance