File size: 4,366 Bytes
e03f966
 
9d97f7a
2368477
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5bda249
a77a127
2368477
e03f966
 
 
73cf1d8
2368477
73cf1d8
2368477
5a04772
9a4f34b
5a04772
 
73cf1d8
5a04772
b4a8aea
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
from transformers import Tool

import re,sys,unicodedata

#####
## https://github.com/Jcharis/textify/tree/master/textify
## pip install textify
####
# Patterns
EMAIL_REGEX = re.compile(r"[\w\.-]+@[\w\.-]+")
PHONE_REGEX = re.compile(r"[\+\(]?[1-9][0-9 .\-\(\)]{8,}[0-9]")
NUMBERS_REGEX = re.compile(r"\d+")
SPECIAL_CHARACTERS_REGEX = re.compile(r"[^A-Za-z0-9 ]+")
EMOJI_REGEX = re.compile("["
                       u"\U0001F600-\U0001F64F"  # emoticons
                       u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                       u"\U0001F680-\U0001F6FF"  # transport & map symbols
                       u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                       u"\U00002702-\U000027B0"
                       u"\U000024C2-\U0001F251"
                       "]+", flags=re.UNICODE)

CURRENCIES = {
	"$": "USD",
	"zł": "PLN",
	"£": "GBP",
	"¥": "JPY",
	"฿": "THB",
	"₡": "CRC",
	"₦": "NGN",
	"₩": "KRW",
	"₪": "ILS",
	"₫": "VND",
	"€": "EUR",
	"₱": "PHP",
	"₲": "PYG",
	"₴": "UAH",
	"₹": "INR",
}
CURRENCY_REGEX = re.compile(
	"({})+".format("|".join(re.escape(c) for c in CURRENCIES.keys()))
)



class TextCleaner(object):
	"""TextCleaner: Class For Text Cleaning
	usage
	docx = TextCleaner()
	docx.text = "this is example@gmail.com and you can reach me at +380994777888 at 5pm#"

	"""
	def __init__(self, text=None):
		super(TextCleaner, self).__init__()
		self.text = text
		
	def __repr__(self):
		return "TextCleaner(text={})".format(self.text)

	def remove_emails(self):
		result = re.sub(EMAIL_REGEX,"",self.text)
		return result
	
	def remove_phone_numbers(self):
		result = re.sub(PHONE_REGEX,"",self.text)
		return result

	def remove_numbers(self):
		result = re.sub(NUMBERS_REGEX,"",self.text)
		return result

	def remove_special_characters(self):
		result = re.sub(SPECIAL_CHARACTERS_REGEX,"",self.text)
		return result

	def remove_emojis(self):
	   result = re.sub(EMOJI_REGEX,"",self.text)
	   return result

	def replace_emails(self,replace_with="<EMAIL>"):
		result = re.sub(EMAIL_REGEX,replace_with,self.text)
		return result
	
	def replace_phone_numbers(self,replace_with="<PHONENUMBER>"):
		result = re.sub(PHONE_REGEX,replace_with,self.text)
		return result

	def replace_numbers(self,replace_with="<NUMBER>"):
		result = re.sub(NUMBERS_REGEX,replace_with,self.text)
		return result

	def replace_special_characters(self,replace_with="<SPECIAL_CHAR>"):
		result = re.sub(SPECIAL_CHARACTERS_REGEX,replace_with,self.text)
		return result


	def clean_text(self,preserve=False):
		if preserve == False:
			email_result = re.sub(EMAIL_REGEX,"",self.text)
			phone_result = re.sub(PHONE_REGEX,"",email_result)
			number_result = re.sub(NUMBERS_REGEX,"",phone_result)
			emoji_result = re.sub(EMOJI_REGEX,"",number_result)
			special_char_result = re.sub(SPECIAL_CHARACTERS_REGEX,"",emoji_result)
			final_result = special_char_result.lower()
			
		else:
			special_char_result = re.sub(r'[^A-Za-z0-9@ ]+',"",self.text)
			email_result = re.sub(EMAIL_REGEX,"<EMAIL>",special_char_result)
			phone_result = re.sub(PHONE_REGEX,"<PHONENUMBER>",email_result)
			number_result = re.sub(NUMBERS_REGEX,"<NUMBERS>",phone_result)
			final_result = number_result.lower()
			
		return final_result


class TextExtractor(TextCleaner):
	"""TextExtractor - Extract emails,numbers and phone numbers from text"""
	def __init__(self, text=None):
		super(TextExtractor, self).__init__()
		self.text = text

	def __repr__(self):
		return "TextExtractor(text={})".format(self.text)

	def extract_emails(self):
		match = re.findall(EMAIL_REGEX,self.text)
		return match
	
	def extract_phone_numbers(self):
		match = re.findall(PHONE_REGEX,self.text)
		return match

	def extract_numbers(self):
		match = re.findall(NUMBERS_REGEX,self.text)
		return match

	def extract_emojis(self):
	   match = re.findall(EMOJI_REGEX,self.text)
	   return match

        
class TextifyTextTool(Tool):
    name = "token_counter"
    description = "This is a tool for cleaning text. It removes bad, unused characters."
    inputs = ["text"]
    outputs = ["text"]

    def __call__(self, text: str):
        docx = TextCleaner()
        docx.text = text
        docx.clean_text()
        text = docx.clean_text()
        print(docx)
        print("---")
        print(text)
       # token = os.environ['hf'] 
        return text