Elron committed on
Commit
5fb7e94
·
1 Parent(s): c5d9b09

Upload processors.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. processors.py +38 -10
processors.py CHANGED
@@ -1,6 +1,5 @@
1
  import json
2
  import re
3
- from typing import Any
4
 
5
  from .operator import BaseFieldOperator
6
 
@@ -17,23 +16,21 @@ class ToStringStripped(BaseFieldOperator):
17
 
18
  class ToListByComma(BaseFieldOperator):
19
  def process(self, instance):
20
- output = [x.strip() for x in instance.split(",")]
21
- return output
22
 
23
 
24
  class RegexParser(BaseFieldOperator):
25
- """
26
- A processor that uses regex in order to parse a string.
27
- """
28
 
29
  regex: str
30
  termination_regex: str = None
31
 
32
  def process(self, text):
33
- if self.termination_regex is not None and re.fullmatch(self.termination_regex, text):
 
 
34
  return []
35
- matches = re.findall(self.regex, text)
36
- return matches
37
 
38
 
39
  class LoadJson(BaseFieldOperator):
@@ -61,7 +58,9 @@ class DictOfListsToPairs(BaseFieldOperator):
61
  for key, values in obj.items():
62
  for value in values:
63
  assert isinstance(value, str)
64
- pair = (key, value) if self.position_key_before_value else (value, key)
 
 
65
  result.append(pair)
66
  return result
67
  except:
@@ -74,3 +73,32 @@ class TakeFirstNonEmptyLine(BaseFieldOperator):
74
  if len(splitted) == 0:
75
  return ""
76
  return splitted[0].strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import json
2
  import re
 
3
 
4
  from .operator import BaseFieldOperator
5
 
 
16
 
17
  class ToListByComma(BaseFieldOperator):
18
  def process(self, instance):
19
+ return [x.strip() for x in instance.split(",")]
 
20
 
21
 
22
  class RegexParser(BaseFieldOperator):
23
+ """A processor that uses regex in order to parse a string."""
 
 
24
 
25
  regex: str
26
  termination_regex: str = None
27
 
28
  def process(self, text):
29
+ if self.termination_regex is not None and re.fullmatch(
30
+ self.termination_regex, text
31
+ ):
32
  return []
33
+ return re.findall(self.regex, text)
 
34
 
35
 
36
  class LoadJson(BaseFieldOperator):
 
58
  for key, values in obj.items():
59
  for value in values:
60
  assert isinstance(value, str)
61
+ pair = (
62
+ (key, value) if self.position_key_before_value else (value, key)
63
+ )
64
  result.append(pair)
65
  return result
66
  except:
 
73
  if len(splitted) == 0:
74
  return ""
75
  return splitted[0].strip()
76
+
77
+
78
class LowerCaseTillPunc(BaseFieldOperator):
    """Lowercase a string and cut it at the first punctuation mark."""

    def process(self, instance):
        """Return lowercased ``instance`` up to the first of ``. , ! ? ;``.

        The punctuation character itself is dropped; when none occurs, the
        whole lowercased string is returned.
        """
        lowered = instance.lower()
        # Splitting once keeps only the text preceding the first punctuation.
        head, *_ = re.split(r"[.,!?;]", lowered, maxsplit=1)
        return head
86
+
87
+
88
class FirstCharacter(BaseFieldOperator):
    """Extract the first word character of a string."""

    def process(self, instance):
        """Return the first regex word character in ``instance``.

        Because the pattern is applied with ``re.search``, anything that
        precedes the first word character (whitespace, punctuation) is
        skipped. Returns ``""`` when the input has no word character.
        """
        match = re.search(r"\s*(\w)", instance)
        if match is None:
            return ""
        # group(1) is the single captured word character; the previous
        # ``groups(0)[0]`` passed 0 as the *default* for unmatched groups,
        # which only worked by accident.
        return match.group(1)
94
+
95
+
96
class StringOrNotString(BaseFieldOperator):
    """Map text to ``string`` or ``"not " + string``, ignoring case."""

    string: str

    def process(self, instance):
        """Return the lowercased label found in ``instance``.

        The negated form (``"not <string>"``) takes precedence over the
        plain one. If neither occurs, ``instance`` is returned unchanged.
        """
        target = self.string.lower()
        haystack = instance.lower()
        # Check the negated form first: it contains the plain form as a
        # substring, so the order of the candidates matters.
        for candidate in ("not " + target, target):
            if candidate in haystack:
                return candidate
        return instance