Spaces:

mginoben
/

tagalog-profanity-classification

Runtime error

mginoben committed on Apr 11, 2023

Commit

114694a

1 Parent(s): 1802e7e

Remove leading and trailing punctuation

Files changed (1) hide show

app.py CHANGED Viewed

@@ -6,7 +6,7 @@ import json
 from thefuzz import process, fuzz
 import numpy as np
 import re
-import string
 API_URL = "https://api-inference.huggingface.co/models/Dabid/test2"
@@ -39,7 +39,8 @@ def fuzzy_lookup(tweet):
     # Loop each word in tweet
     for word in tweet.split():
         # Remove punctuations
-        base_word = word.translate(str.maketrans('', '', string.punctuation))
         # Only get digits and letters then lowercase
         processed_word = re.sub("[^a-zA-Z0-9@]", "", word).lower()
         scores = []
@@ -55,15 +56,15 @@ def fuzzy_lookup(tweet):
             if len(scores) > 0:
                 max_score_index = np.argmax(scores)
                 if matched_words[max_score_index] in lookup_profanity:
-                    matches[base_word] = matched_words[max_score_index]
-    for base_word, matched_profanity in matches.items():
-        word_split = base_word.split(matched_profanity[-2:])
         for pronoun in obj_pronouns:
             if len(word_split) > 1:
                 if pronoun == word_split[-1]:
-                    matches[base_word] = matched_profanity + ' ' + pronoun
                     break
     # Replace each profanities by fuzzy lookup result

 from thefuzz import process, fuzz
 import numpy as np
 import re
+from string import punctuation
 API_URL = "https://api-inference.huggingface.co/models/Dabid/test2"
     # Loop each word in tweet
     for word in tweet.split():
         # Remove punctuations
+        word = word.strip(punctuation)
         # Only get digits and letters then lowercase
         processed_word = re.sub("[^a-zA-Z0-9@]", "", word).lower()
         scores = []
             if len(scores) > 0:
                 max_score_index = np.argmax(scores)
                 if matched_words[max_score_index] in lookup_profanity:
+                    matches[word] = matched_words[max_score_index]
+    for word, matched_profanity in matches.items():
+        word_split = word.split(matched_profanity[-2:])
         for pronoun in obj_pronouns:
             if len(word_split) > 1:
                 if pronoun == word_split[-1]:
+                    matches[word] = matched_profanity + ' ' + pronoun
                     break
     # Replace each profanities by fuzzy lookup result