Chris4K committed on
Commit
2368477
1 Parent(s): 5bda249

Update textify_text.py

Browse files
Files changed (1) hide show
  1. textify_text.py +143 -2
textify_text.py CHANGED
@@ -1,15 +1,156 @@
1
  from transformers import AutoTokenizer
2
  from transformers import Tool
3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
class TextifyTextTool(Tool):
    """Agent tool that reports how many tokens a prompt uses."""

    name = "token_counter"
    # The description previously promised "the generated text"; the tool
    # actually returns the token count, so the wording now says so.
    description = "This is a tool for counting the tokens used by a prompt. It takes a prompt as input and returns the number of tokens as text."
    inputs = ["text"]
    outputs = ["text"]

    def __call__(self, prompt: str):
        """Return the number of tokens in ``prompt`` as a decimal string.

        The tokenizer is obtained with ``AutoTokenizer.from_pretrained`` on
        every call.
        """
        tokenizer = AutoTokenizer.from_pretrained("lgaalves/gpt2-dolly")
        token_ids = tokenizer(prompt)["input_ids"]
        return str(len(token_ids))
15
 
 
1
import re

from transformers import AutoTokenizer
from transformers import Tool
3
 
4
+
5
+ #####
6
+ ## https://github.com/Jcharis/textify/tree/master/textify
7
+ ## pip install textify
8
+ ####
9
+ # Patterns
10
# Compiled once at import time; the cleaner/extractor classes below reuse them.
EMAIL_REGEX = re.compile(r"[\w\.-]+@[\w\.-]+")
PHONE_REGEX = re.compile(r"[\+\(]?[1-9][0-9 .\-\(\)]{8,}[0-9]")
NUMBERS_REGEX = re.compile(r"\d+")
SPECIAL_CHARACTERS_REGEX = re.compile(r"[^A-Za-z0-9 ]+")
# The symbol blocks are listed explicitly instead of one U+24C2-U+1F251 range:
# that single range also spans CJK ideographs, kana and Hangul, so ordinary
# non-Latin text was being matched (and removed) as if it were emoji.
EMOJI_REGEX = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"  # dingbats
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U000024C2"  # circled latin capital letter M
    "\U0001F000-\U0001F251"  # tiles, cards and enclosed characters
    "]+",
    flags=re.UNICODE,
)
22
+
23
# Currency symbol -> three-letter currency code.
CURRENCIES = {
    "$": "USD",
    "zł": "PLN",
    "£": "GBP",
    "¥": "JPY",
    "฿": "THB",
    "₡": "CRC",
    "₦": "NGN",
    "₩": "KRW",
    "₪": "ILS",
    "₫": "VND",
    "€": "EUR",
    "₱": "PHP",
    "₲": "PYG",
    "₴": "UAH",
    "₹": "INR",
}
# One or more consecutive currency symbols; each symbol is escaped so that
# regex metacharacters such as "$" are matched literally.
CURRENCY_REGEX = re.compile("(" + "|".join(map(re.escape, CURRENCIES)) + ")+")
43
+
44
+
45
+
46
class TextCleaner:
    """Remove or replace e-mails, phone numbers, digits, emojis and symbols.

    Every method reads ``self.text`` and returns a new string; ``self.text``
    itself is never modified. ``self.text`` must be a string before any
    method is called (the default ``None`` makes the regex calls raise
    ``TypeError``).

    Usage::

        docx = TextCleaner()
        docx.text = "this is example@gmail.com and you can reach me at +380994777888 at 5pm#"
        cleaned = docx.clean_text()
    """

    def __init__(self, text=None):
        super().__init__()
        self.text = text

    def __repr__(self):
        return "TextCleaner(text={})".format(self.text)

    def remove_emails(self):
        """Return the text with every ``EMAIL_REGEX`` match deleted."""
        return EMAIL_REGEX.sub("", self.text)

    def remove_phone_numbers(self):
        """Return the text with every ``PHONE_REGEX`` match deleted."""
        return PHONE_REGEX.sub("", self.text)

    def remove_numbers(self):
        """Return the text with every run of digits deleted."""
        return NUMBERS_REGEX.sub("", self.text)

    def remove_special_characters(self):
        """Return the text keeping only ASCII letters, digits and spaces."""
        return SPECIAL_CHARACTERS_REGEX.sub("", self.text)

    def remove_emojis(self):
        """Return the text with every ``EMOJI_REGEX`` match deleted."""
        return EMOJI_REGEX.sub("", self.text)

    def replace_emails(self, replace_with="<EMAIL>"):
        """Return the text with each e-mail match replaced by ``replace_with``."""
        return EMAIL_REGEX.sub(replace_with, self.text)

    def replace_phone_numbers(self, replace_with="<PHONENUMBER>"):
        """Return the text with each phone match replaced by ``replace_with``."""
        return PHONE_REGEX.sub(replace_with, self.text)

    def replace_numbers(self, replace_with="<NUMBER>"):
        """Return the text with each digit run replaced by ``replace_with``."""
        return NUMBERS_REGEX.sub(replace_with, self.text)

    def replace_special_characters(self, replace_with="<SPECIAL_CHAR>"):
        """Return the text with each special-character run replaced by ``replace_with``."""
        return SPECIAL_CHARACTERS_REGEX.sub(replace_with, self.text)

    def clean_text(self, preserve=False):
        """Return a lower-cased, cleaned copy of ``self.text``.

        Args:
            preserve: When falsy (default), e-mails, phone numbers, digits,
                emojis and special characters are removed. When truthy,
                e-mails, phone numbers and digit runs are replaced by
                placeholders instead of being dropped.

        Returns:
            The cleaned string. Lower-casing is the last step, so in
            ``preserve`` mode the placeholders come out as ``<email>``,
            ``<phonenumber>`` and ``<numbers>``.
        """
        # Truthiness test rather than ``== False`` so that ``None`` and other
        # falsy values select the default (removal) branch.
        if not preserve:
            cleaned = EMAIL_REGEX.sub("", self.text)
            cleaned = PHONE_REGEX.sub("", cleaned)
            cleaned = NUMBERS_REGEX.sub("", cleaned)
            cleaned = EMOJI_REGEX.sub("", cleaned)
            cleaned = SPECIAL_CHARACTERS_REGEX.sub("", cleaned)
            return cleaned.lower()

        # Special characters are stripped first; "@" is kept so that e-mail
        # addresses can still be recognised by EMAIL_REGEX afterwards.
        cleaned = re.sub(r"[^A-Za-z0-9@ ]+", "", self.text)
        cleaned = EMAIL_REGEX.sub("<EMAIL>", cleaned)
        cleaned = PHONE_REGEX.sub("<PHONENUMBER>", cleaned)
        cleaned = NUMBERS_REGEX.sub("<NUMBERS>", cleaned)
        return cleaned.lower()
114
+
115
+
116
class TextExtractor(TextCleaner):
    """TextExtractor - find e-mails, phone numbers, digit runs and emojis in text.

    Each ``extract_*`` method returns the list of matches found in
    ``self.text`` for the corresponding module-level pattern.
    """

    def __init__(self, text=None):
        # The parent initialiser stores ``text`` on the instance.
        super().__init__(text)

    def __repr__(self):
        return f"TextExtractor(text={self.text})"

    def extract_emails(self):
        """Return all ``EMAIL_REGEX`` matches in the text."""
        return EMAIL_REGEX.findall(self.text)

    def extract_phone_numbers(self):
        """Return all ``PHONE_REGEX`` matches in the text."""
        return PHONE_REGEX.findall(self.text)

    def extract_numbers(self):
        """Return all digit runs in the text, as strings."""
        return NUMBERS_REGEX.findall(self.text)

    def extract_emojis(self):
        """Return all ``EMOJI_REGEX`` matches in the text."""
        return EMOJI_REGEX.findall(self.text)
140
+
141
+
142
class TextifyTextTool(Tool):
    """Agent tool that returns a cleaned copy of the text it is given."""

    # NOTE(review): the name still reads "token_counter" although the tool now
    # cleans text. Left unchanged because callers presumably look the tool up
    # by this name - confirm and rename if safe.
    name = "token_counter"
    description = "This is a tool for cleaning text. It removes bad, unused characters."
    inputs = ["text"]
    outputs = ["text"]

    def __call__(self, prompt: str):
        """Clean ``prompt`` and return the result.

        Args:
            prompt: The raw text to clean.

        Returns:
            The output of ``TextCleaner.clean_text()`` for ``prompt``.
        """
        # Clean the caller's text and return the cleaned result. The previous
        # body cleaned a hard-coded placeholder, discarded the result, and
        # loaded a tokenizer whose output was never used.
        return TextCleaner(prompt).clean_text()
156