Spaces:

mginoben
/

tagalog-profanity-classification

Sleeping

mginoben committed on Apr 23, 2023

Commit

33125f0

•

1 Parent(s): 40a4fcd

Fixed profanities in English lookup words

Files changed (1) hide show

app.py CHANGED Viewed

@@ -40,6 +40,7 @@ obj_pronouns = read_text('obj_pronouns')
 profanities = read_text('profanities', 'json')
 eng_words = set(words.words())
 def fuzzy_lookup(tweet):
@@ -47,7 +48,7 @@ def fuzzy_lookup(tweet):
     lookup_profanity = np.concatenate([np.hstack(list(profanities.values())), list(profanities.keys())])
     for word in tweet.split():
-        if word in eng_words:
             break
         scores = []
         matched_words = []
@@ -66,6 +67,7 @@ def fuzzy_lookup(tweet):
                 if matched_words[max_score_index] in lookup_profanity:
                     matched_profanity[word] = matched_words[max_score_index]
     for word, profanity in matched_profanity.items():
         word_split = word.split(profanity[-2:])
         for pronoun in obj_pronouns:

 profanities = read_text('profanities', 'json')
 eng_words = set(words.words())
+# TODO: check English words that are also Tagalog profanities
 def fuzzy_lookup(tweet):
     lookup_profanity = np.concatenate([np.hstack(list(profanities.values())), list(profanities.keys())])
     for word in tweet.split():
+        if word in list(set(eng_words) - set(lookup_profanity)):
             break
         scores = []
         matched_words = []
                 if matched_words[max_score_index] in lookup_profanity:
                     matched_profanity[word] = matched_words[max_score_index]
+    # Expand Pronouns in Profanities
     for word, profanity in matched_profanity.items():
         word_split = word.split(profanity[-2:])
         for pronoun in obj_pronouns: