saied committed on
Commit
a32918a
1 Parent(s): 09f9c26

adding remove_adds and remove_tags functions

Browse files
src/__pycache__/dictionary.cpython-38.pyc ADDED
Binary file (2.07 kB). View file
 
src/data_utils.py CHANGED
@@ -23,6 +23,14 @@ def filter_by_num_tokens(text, gt=64):
23
  def filter_by_num_sents(text, gt=2):
24
  return True if len(sent_tokenize(text)) > gt else False
25
 
 
 
 
 
 
 
 
 
26
 
27
  def normalizer(text, do_lowercase=False):
28
  text = normalize(text)
 
23
  def filter_by_num_sents(text, gt=2):
24
  return True if len(sent_tokenize(text)) > gt else False
25
 
26
def remove_adds(text, ratio=50):
    """Return True when ``text`` has a separator score below ``ratio``.

    The score is the sum of three counts:

    * the number of pieces ``text`` splits into on the ASCII comma ``,``
      (occurrences + 1),
    * the number of non-overlapping ``word:word`` pairs,
    * the number of pieces ``text`` splits into on the Persian comma ``،``
      (occurrences + 1).

    Because both comma terms count pieces rather than separators, a text
    with no separators at all still scores 2.

    Args:
        text: The string to inspect.
        ratio: Exclusive upper bound on the score; despite the name it is
            compared as an absolute count, not a proportion.

    Returns:
        bool: ``True`` if the score is strictly less than ``ratio``.
    """
    # count(sep) + 1 equals len(text.split(sep)) without building the list.
    comma_pieces = text.count(",") + 1
    persian_comma_pieces = text.count("،") + 1
    # ``\w`` is the same class as ``[^\W]``; with no capture groups findall
    # yields one entry per match, so the number of matches is unchanged.
    colon_pairs = len(re.findall(r"\w+:\w+", text))

    score = comma_pieces + colon_pairs + persian_comma_pieces
    return score < ratio
33
+
34
 
35
  def normalizer(text, do_lowercase=False):
36
  text = normalize(text)
src/normalizer.py CHANGED
@@ -25,6 +25,13 @@ def multiple_replace(text, chars_to_mapping):
25
  pattern = "|".join(map(re.escape, chars_to_mapping.keys()))
26
  return re.sub(pattern, lambda m: chars_to_mapping[m.group()], str(text))
27
 
 
 
 
 
 
 
 
28
 
29
  def clean_url(text):
30
  # removing html tags
@@ -79,6 +86,7 @@ def normalize(text, zwnj="\u200c", tokenized=False):
79
  text = DOUBLE_QUOTE_REGEX.sub('"', text)
80
  text = CURRENCY_REGEX.sub(r" \1 ", text)
81
  text = clean_url(text)
 
82
  text = URL_REGEX.sub(" ", text)
83
  text = EMAIL_REGEX.sub(" ", text)
84
  text = PHONE_REGEX.sub(r" \1 ", text)
 
25
  pattern = "|".join(map(re.escape, chars_to_mapping.keys()))
26
  return re.sub(pattern, lambda m: chars_to_mapping[m.group()], str(text))
27
 
28
def remove_tags(text):
    """Cut ``text`` at the first occurrence of the Persian tags marker.

    Everything from the marker ``"برچسب ها :"`` to the end of the string is
    dropped. If the marker is absent, ``text`` is returned unchanged.

    Args:
        text: The string to trim.

    Returns:
        The portion of ``text`` before the marker, or ``text`` itself when
        the marker is not found. Non-string input is returned as-is.
    """
    tag = "برچسب ها :"
    # Keep the best-effort contract for non-string input explicitly, rather
    # than relying on a bare ``except`` that would also hide unrelated errors
    # such as KeyboardInterrupt.
    if not isinstance(text, str):
        return text
    # partition() returns the whole string as the head when the marker is
    # missing, so no exception handling is needed.
    head, _, _ = text.partition(tag)
    return head
35
 
36
  def clean_url(text):
37
  # removing html tags
 
86
  text = DOUBLE_QUOTE_REGEX.sub('"', text)
87
  text = CURRENCY_REGEX.sub(r" \1 ", text)
88
  text = clean_url(text)
89
+ text = remove_tags(text)
90
  text = URL_REGEX.sub(" ", text)
91
  text = EMAIL_REGEX.sub(" ", text)
92
  text = PHONE_REGEX.sub(r" \1 ", text)
src/regexes/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (168 Bytes). View file
 
src/regexes/__pycache__/currency.cpython-38.pyc ADDED
Binary file (691 Bytes). View file
 
src/regexes/__pycache__/email.cpython-38.pyc ADDED
Binary file (482 Bytes). View file
 
src/regexes/__pycache__/latin.cpython-38.pyc ADDED
Binary file (382 Bytes). View file
 
src/regexes/__pycache__/number.cpython-38.pyc ADDED
Binary file (348 Bytes). View file
 
src/regexes/__pycache__/persian.cpython-38.pyc ADDED
Binary file (549 Bytes). View file
 
src/regexes/__pycache__/phone.cpython-38.pyc ADDED
Binary file (378 Bytes). View file
 
src/regexes/__pycache__/punk.cpython-38.pyc ADDED
Binary file (309 Bytes). View file
 
src/regexes/__pycache__/quote.cpython-38.pyc ADDED
Binary file (589 Bytes). View file
 
src/regexes/__pycache__/url.cpython-38.pyc ADDED
Binary file (777 Bytes). View file