mginoben committed on
Commit
8adc428
1 Parent(s): c96199c

Fixed words with # and @

Browse files
Files changed (2) hide show
  1. app.py +16 -10
  2. profanities.json +1 -1
app.py CHANGED
@@ -41,6 +41,7 @@ profanities = read_text('profanities', 'json')
41
  lookup_profanity = np.concatenate([np.hstack(list(profanities.values())), list(profanities.keys())])
42
  lookup_words = list(set(similar_words).union(set(lookup_profanity.tolist())))
43
  eng_words = list(set(words.words()) - set(lookup_profanity))
 
44
 
45
  # TODO check eng words that are tagalog profanities
46
 
@@ -48,29 +49,34 @@ def fuzzy_lookup(tweet):
48
 
49
  matched_profanity = []
50
 
51
- for word in tweet.split():
52
 
53
- word = word.strip(punctuation)
54
- base_word = word
55
 
 
 
 
56
  if word in eng_words:
57
  continue
58
 
59
  for addon in addon_words:
60
  if word.startswith(addon):
61
- word[len(addon):]
62
  if word.endswith(addon):
63
- word[:-len(addon)]
 
 
 
64
 
65
  scores = []
66
  matched_words = []
67
-
68
- processed_word = re.sub("[^a-zA-Z0-9@#]", "", word)
69
 
70
- if len(processed_word) >= 4:
 
 
71
  # Get fuzzy ratio
72
  for lookup_word in lookup_words:
73
- score = fuzz.ratio(processed_word, lookup_word)
74
  if score >= 70:
75
  scores.append(score)
76
  matched_words.append(lookup_word)
@@ -159,4 +165,4 @@ demo = gr.Interface(
159
  title="Tagalog Profanity Classifier"
160
  )
161
 
162
- demo.launch(debug=True)
 
41
  lookup_profanity = np.concatenate([np.hstack(list(profanities.values())), list(profanities.keys())])
42
  lookup_words = list(set(similar_words).union(set(lookup_profanity.tolist())))
43
  eng_words = list(set(words.words()) - set(lookup_profanity))
44
+ punctuations = re.compile(r'^[^\w#@]+|[^\w#@]+$')
45
 
46
  # TODO check eng words that are tagalog profanities
47
 
 
49
 
50
  matched_profanity = []
51
 
52
+ # tweet = punctuations.sub('', tweet).lower()
53
 
54
+ for word in tweet.split():
 
55
 
56
+ word = punctuations.sub('', word).lower()
57
+ base_word = word
58
+
59
  if word in eng_words:
60
  continue
61
 
62
  for addon in addon_words:
63
  if word.startswith(addon):
64
+ word = word[len(addon):]
65
  if word.endswith(addon):
66
+ word = word[:-len(addon)]
67
+
68
+ if word.startswith("@") or word.startswith("#"):
69
+ word = word[1:]
70
 
71
  scores = []
72
  matched_words = []
 
 
73
 
74
+ print(word)
75
+
76
+ if len(word) >= 4:
77
  # Get fuzzy ratio
78
  for lookup_word in lookup_words:
79
+ score = fuzz.ratio(word, lookup_word)
80
  if score >= 70:
81
  scores.append(score)
82
  matched_words.append(lookup_word)
 
165
  title="Tagalog Profanity Classifier"
166
  )
167
 
168
+ demo.launch(debug=True)
profanities.json CHANGED
@@ -9,7 +9,7 @@
9
  "punyeta": [],
10
  "puta": ["pota"],
11
  "putangina": ["pukingina", "kinangina", "putang"],
12
- "tanga": [],
13
  "tangina": ["tangna", "inamo", "tatanga"],
14
  "tarantado": ["t4r4nt4do", "t@r@nt@do"],
15
  "ulol": ["ul0l", "olol", "0lol"]
 
9
  "punyeta": [],
10
  "puta": ["pota"],
11
  "putangina": ["pukingina", "kinangina", "putang"],
12
+ "tanga": ["t4ng4"],
13
  "tangina": ["tangna", "inamo", "tatanga"],
14
  "tarantado": ["t4r4nt4do", "t@r@nt@do"],
15
  "ulol": ["ul0l", "olol", "0lol"]