Upload processors.py with huggingface_hub
Browse files- processors.py +38 -10
processors.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1 |
import json
|
2 |
import re
|
3 |
-
from typing import Any
|
4 |
|
5 |
from .operator import BaseFieldOperator
|
6 |
|
@@ -17,23 +16,21 @@ class ToStringStripped(BaseFieldOperator):
|
|
17 |
|
18 |
class ToListByComma(BaseFieldOperator):
|
19 |
def process(self, instance):
|
20 |
-
|
21 |
-
return output
|
22 |
|
23 |
|
24 |
class RegexParser(BaseFieldOperator):
|
25 |
-
"""
|
26 |
-
A processor that uses regex in order to parse a string.
|
27 |
-
"""
|
28 |
|
29 |
regex: str
|
30 |
termination_regex: str = None
|
31 |
|
32 |
def process(self, text):
|
33 |
-
if self.termination_regex is not None and re.fullmatch(
|
|
|
|
|
34 |
return []
|
35 |
-
|
36 |
-
return matches
|
37 |
|
38 |
|
39 |
class LoadJson(BaseFieldOperator):
|
@@ -61,7 +58,9 @@ class DictOfListsToPairs(BaseFieldOperator):
|
|
61 |
for key, values in obj.items():
|
62 |
for value in values:
|
63 |
assert isinstance(value, str)
|
64 |
-
pair = (
|
|
|
|
|
65 |
result.append(pair)
|
66 |
return result
|
67 |
except:
|
@@ -74,3 +73,32 @@ class TakeFirstNonEmptyLine(BaseFieldOperator):
|
|
74 |
if len(splitted) == 0:
|
75 |
return ""
|
76 |
return splitted[0].strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import json
|
2 |
import re
|
|
|
3 |
|
4 |
from .operator import BaseFieldOperator
|
5 |
|
|
|
16 |
|
17 |
class ToListByComma(BaseFieldOperator):
    """Turn a comma-separated string into a list of trimmed items."""

    def process(self, instance):
        """Split ``instance`` on commas and strip whitespace from each piece.

        Uses ``str.split(",")``, so an input without any comma yields a
        one-element list and empty pieces are kept as empty strings.
        """
        pieces = instance.split(",")
        return [piece.strip() for piece in pieces]
|
|
|
20 |
|
21 |
|
22 |
class RegexParser(BaseFieldOperator):
    """Parse a string by collecting every match of a regular expression."""

    # Pattern handed to ``re.findall`` to extract the results.
    regex: str
    # Optional pattern; when the whole text matches it, nothing is extracted.
    termination_regex: str = None

    def process(self, text):
        """Return ``re.findall(self.regex, text)``, or ``[]`` on termination.

        Termination applies only when ``termination_regex`` is set and it
        matches ``text`` in full (``re.fullmatch``).
        """
        terminated = (
            self.termination_regex is not None
            and re.fullmatch(self.termination_regex, text)
        )
        if terminated:
            return []
        return re.findall(self.regex, text)
|
|
|
34 |
|
35 |
|
36 |
class LoadJson(BaseFieldOperator):
|
|
|
58 |
for key, values in obj.items():
|
59 |
for value in values:
|
60 |
assert isinstance(value, str)
|
61 |
+
pair = (
|
62 |
+
(key, value) if self.position_key_before_value else (value, key)
|
63 |
+
)
|
64 |
result.append(pair)
|
65 |
return result
|
66 |
except:
|
|
|
73 |
if len(splitted) == 0:
|
74 |
return ""
|
75 |
return splitted[0].strip()
|
76 |
+
|
77 |
+
|
78 |
+
class LowerCaseTillPunc(BaseFieldOperator):
    """Lower-case a string and cut it at its first punctuation mark."""

    def process(self, instance):
        """Return lower-cased ``instance`` up to the first of ``. , ! ? ;``.

        When none of those characters occurs, the whole lower-cased string
        is returned.
        """
        lowered = instance.lower()
        first_punc = re.search(r"[.,!?;]", lowered)
        if not first_punc:
            return lowered
        # Keep only the text preceding the punctuation character.
        return lowered[: first_punc.start()]
|
86 |
+
|
87 |
+
|
88 |
+
class FirstCharacter(BaseFieldOperator):
    """Extract the first word character found in a string."""

    def process(self, instance):
        """Return the first ``\\w`` character of ``instance``, or ``""``.

        ``\\w`` covers letters, digits and the underscore. Leading whitespace
        (or any other non-word text) before that character is skipped.
        An empty string is returned when no word character is present.
        """
        match = re.search(r"\s*(\w)", instance)
        if match:
            # group(1) is the captured character. The previous
            # ``match.groups(0)[0]`` only worked because the argument of
            # ``groups`` is a default for unmatched groups, not an index.
            return match.group(1)
        return ""
|
94 |
+
|
95 |
+
|
96 |
+
class StringOrNotString(BaseFieldOperator):
    """Normalize text to ``string`` or ``"not " + string`` when either appears."""

    # The target phrase; compared case-insensitively against the input.
    string: str

    def process(self, instance):
        """Map ``instance`` to the lower-cased target, its negation, or itself.

        The negated form (``"not "`` followed by the target) is checked first,
        because any text containing it also contains the bare target. If
        neither form occurs, ``instance`` is returned unchanged.
        """
        target = self.string.lower()
        negated = "not " + target
        haystack = instance.lower()

        if negated in haystack:
            return negated
        if target in haystack:
            return target
        return instance
|