Spaces:

unitxt
/

metric

Running

App Files Community

Elron committed on Dec 3, 2023

Commit

5fb7e94

1 Parent(s): c5d9b09

Upload processors.py with huggingface_hub

Browse files

Files changed (1) hide show

processors.py +38 -10

processors.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import json
 import re
-from typing import Any
 from .operator import BaseFieldOperator
@@ -17,23 +16,21 @@ class ToStringStripped(BaseFieldOperator):
 class ToListByComma(BaseFieldOperator):
     def process(self, instance):
-        output = [x.strip() for x in instance.split(",")]
-        return output
 class RegexParser(BaseFieldOperator):
-    """
-    A processor that uses regex in order to parse a string.
-    """
     regex: str
     termination_regex: str = None
     def process(self, text):
-        if self.termination_regex is not None and re.fullmatch(self.termination_regex, text):
             return []
-        matches = re.findall(self.regex, text)
-        return matches
 class LoadJson(BaseFieldOperator):
@@ -61,7 +58,9 @@ class DictOfListsToPairs(BaseFieldOperator):
             for key, values in obj.items():
                 for value in values:
                     assert isinstance(value, str)
-                    pair = (key, value) if self.position_key_before_value else (value, key)
                     result.append(pair)
             return result
         except:
@@ -74,3 +73,32 @@ class TakeFirstNonEmptyLine(BaseFieldOperator):
         if len(splitted) == 0:
             return ""
         return splitted[0].strip()

 import json
 import re
 from .operator import BaseFieldOperator
 class ToListByComma(BaseFieldOperator):
     """Split a comma-separated string into a list of stripped items."""

     def process(self, instance):
         """Return the comma-delimited pieces of ``instance``.

         Each piece has its leading and trailing whitespace removed.
         An input without commas yields a single-element list.
         """
         pieces = instance.split(",")
         return list(map(str.strip, pieces))
 class RegexParser(BaseFieldOperator):
     """Parse a string by collecting every match of a regular expression."""

     # Pattern handed to ``re.findall`` to extract the matches.
     regex: str
     # Optional pattern; when it matches the whole input, nothing is extracted.
     termination_regex: str = None

     def process(self, text):
         """Return ``re.findall(self.regex, text)``.

         Returns an empty list instead when ``termination_regex`` is set
         and fully matches ``text``.
         """
         stop_pattern = self.termination_regex
         if stop_pattern is None or not re.fullmatch(stop_pattern, text):
             return re.findall(self.regex, text)
         return []
 class LoadJson(BaseFieldOperator):
             for key, values in obj.items():
                 for value in values:
                     assert isinstance(value, str)
+                    pair = (
+                        (key, value) if self.position_key_before_value else (value, key)
+                    )
                     result.append(pair)
             return result
         except:
         if len(splitted) == 0:
             return ""
         return splitted[0].strip()
class LowerCaseTillPunc(BaseFieldOperator):
    """Lower-case a string and cut it at the first punctuation mark."""

    def process(self, instance):
        """Return the lower-cased text preceding the first of ``. , ! ? ;``.

        When none of those characters occurs, the whole lower-cased
        string is returned.
        """
        lowered = instance.lower()
        first_punc = re.search(r"[.,!?;]", lowered)
        if first_punc is None:
            return lowered
        # Keep only what comes before the punctuation character.
        return lowered[: first_punc.start()]
class FirstCharacter(BaseFieldOperator):
    """Extract the first word character (``\\w``) found in a string."""

    def process(self, instance):
        """Return the first ``\\w`` character in ``instance``, or ``""`` if none.

        ``\\w`` covers letters, digits and the underscore, so leading
        whitespace and punctuation are skipped.
        """
        match = re.search(r"\s*(\w)", instance)
        if match:
            # ``group(1)`` is the captured character. The previous
            # ``groups(0)[0]`` only worked by accident: the argument of
            # ``groups()`` is a default for unmatched groups, not an index.
            return match.group(1)
        return ""
class StringOrNotString(BaseFieldOperator):
    """Map free text onto ``"not <string>"`` or ``"<string>"`` when present."""

    # The phrase to look for; comparison is case-insensitive.
    string: str

    def process(self, instance):
        """Return the normalized label found in ``instance``.

        The negated form ``"not " + string`` is checked first, so it wins
        over the bare string it contains. Both checks are case-insensitive
        substring tests and return the lower-cased label. If neither
        occurs, ``instance`` is returned unchanged.
        """
        target = self.string.lower()
        negated = "not " + target
        haystack = instance.lower()
        for candidate in (negated, target):
            if candidate in haystack:
                return candidate
        return instance