Fixed words with # and @
Browse files
- app.py +16 -10
- profanities.json +1 -1
app.py
CHANGED
@@ -41,6 +41,7 @@ profanities = read_text('profanities', 'json')
|
|
41 |
lookup_profanity = np.concatenate([np.hstack(list(profanities.values())), list(profanities.keys())])
|
42 |
lookup_words = list(set(similar_words).union(set(lookup_profanity.tolist())))
|
43 |
eng_words = list(set(words.words()) - set(lookup_profanity))
|
|
|
44 |
|
45 |
# TODO check eng words that are tagalog profanities
|
46 |
|
@@ -48,29 +49,34 @@ def fuzzy_lookup(tweet):
|
|
48 |
|
49 |
matched_profanity = []
|
50 |
|
51 |
-
|
52 |
|
53 |
-
|
54 |
-
base_word = word
|
55 |
|
|
|
|
|
|
|
56 |
if word in eng_words:
|
57 |
continue
|
58 |
|
59 |
for addon in addon_words:
|
60 |
if word.startswith(addon):
|
61 |
-
word[len(addon):]
|
62 |
if word.endswith(addon):
|
63 |
-
word[:-len(addon)]
|
|
|
|
|
|
|
64 |
|
65 |
scores = []
|
66 |
matched_words = []
|
67 |
-
|
68 |
-
processed_word = re.sub("[^a-zA-Z0-9@#]", "", word)
|
69 |
|
70 |
-
|
|
|
|
|
71 |
# Get fuzzy ratio
|
72 |
for lookup_word in lookup_words:
|
73 |
-
score = fuzz.ratio(
|
74 |
if score >= 70:
|
75 |
scores.append(score)
|
76 |
matched_words.append(lookup_word)
|
@@ -159,4 +165,4 @@ demo = gr.Interface(
|
|
159 |
title="Tagalog Profanity Classifier"
|
160 |
)
|
161 |
|
162 |
-
demo.launch(debug=True)
|
|
|
41 |
lookup_profanity = np.concatenate([np.hstack(list(profanities.values())), list(profanities.keys())])
|
42 |
lookup_words = list(set(similar_words).union(set(lookup_profanity.tolist())))
|
43 |
eng_words = list(set(words.words()) - set(lookup_profanity))
|
44 |
+
punctuations = re.compile(r'^[^\w#@]+|[^\w#@]+$')
|
45 |
|
46 |
# TODO check eng words that are tagalog profanities
|
47 |
|
|
|
49 |
|
50 |
matched_profanity = []
|
51 |
|
52 |
+
# tweet = punctuations.sub('', tweet).lower()
|
53 |
|
54 |
+
for word in tweet.split():
|
|
|
55 |
|
56 |
+
word = punctuations.sub('', word).lower()
|
57 |
+
base_word = word
|
58 |
+
|
59 |
if word in eng_words:
|
60 |
continue
|
61 |
|
62 |
for addon in addon_words:
|
63 |
if word.startswith(addon):
|
64 |
+
word = word[len(addon):]
|
65 |
if word.endswith(addon):
|
66 |
+
word = word[:-len(addon)]
|
67 |
+
|
68 |
+
if word.startswith("@") or word.startswith("#"):
|
69 |
+
word = word[1:]
|
70 |
|
71 |
scores = []
|
72 |
matched_words = []
|
|
|
|
|
73 |
|
74 |
+
print(word)
|
75 |
+
|
76 |
+
if len(word) >= 4:
|
77 |
# Get fuzzy ratio
|
78 |
for lookup_word in lookup_words:
|
79 |
+
score = fuzz.ratio(word, lookup_word)
|
80 |
if score >= 70:
|
81 |
scores.append(score)
|
82 |
matched_words.append(lookup_word)
|
|
|
165 |
title="Tagalog Profanity Classifier"
|
166 |
)
|
167 |
|
168 |
+
demo.launch(debug=True)
|
profanities.json
CHANGED
@@ -9,7 +9,7 @@
|
|
9 |
"punyeta": [],
|
10 |
"puta": ["pota"],
|
11 |
"putangina": ["pukingina", "kinangina", "putang"],
|
12 |
-
"tanga": [],
|
13 |
"tangina": ["tangna", "inamo", "tatanga"],
|
14 |
"tarantado": ["t4r4nt4do", "t@r@nt@do"],
|
15 |
"ulol": ["ul0l", "olol", "0lol"]
|
|
|
9 |
"punyeta": [],
|
10 |
"puta": ["pota"],
|
11 |
"putangina": ["pukingina", "kinangina", "putang"],
|
12 |
+
"tanga": ["t4ng4"],
|
13 |
"tangina": ["tangna", "inamo", "tatanga"],
|
14 |
"tarantado": ["t4r4nt4do", "t@r@nt@do"],
|
15 |
"ulol": ["ul0l", "olol", "0lol"]
|