Spaces:

mginoben
/

tagalog-profanity-classification

Sleeping

mginoben committed on Apr 7, 2023

Commit

93baba5

•

1 Parent(s): 1c79acc

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -37,7 +37,9 @@ def fuzzy_lookup(tweet):
     # Loop each word in tweet
     for word in tweet.split():
-         # Only get digits and letters then lowercase
         processed_word = re.sub("[^a-zA-Z0-9@]", "", word).lower()
         scores = []
         matched_words = []
@@ -52,15 +54,15 @@ def fuzzy_lookup(tweet):
             if len(scores) > 0:
                 max_score_index = np.argmax(scores)
                 if matched_words[max_score_index] in lookup_profanity:
-                    matches[word] = matched_words[max_score_index]
-    for word, matched_profanity in matches.items():
-        word_split = word.split(matched_profanity[-2:])
         for pronoun in obj_pronouns:
             if len(word_split) > 1:
                 if pronoun == word_split[-1]:
-                    matches[word] = matched_profanity + ' ' + pronoun
                     break
     # Replace each profanities by fuzzy lookup result

     # Loop each word in tweet
     for word in tweet.split():
+        # Remove punctuations
+        base_word = word.translate(str.maketrans('', '', string.punctuation))
+        # Only get digits and letters then lowercase
         processed_word = re.sub("[^a-zA-Z0-9@]", "", word).lower()
         scores = []
         matched_words = []
             if len(scores) > 0:
                 max_score_index = np.argmax(scores)
                 if matched_words[max_score_index] in lookup_profanity:
+                    matches[base_word] = matched_words[max_score_index]
+    for base_word, matched_profanity in matches.items():
+        word_split = base_word.split(matched_profanity[-2:])
         for pronoun in obj_pronouns:
             if len(word_split) > 1:
                 if pronoun == word_split[-1]:
+                    matches[base_word] = matched_profanity + ' ' + pronoun
                     break
     # Replace each profanities by fuzzy lookup result