Spaces:

mginoben
/

tagalog-profanity-classification

Sleeping

mginoben committed on Apr 4, 2023

Commit

16316d5

•

1 Parent(s): fe9ff70

Bug fixes

Files changed (1) hide show

app.py CHANGED Viewed

@@ -37,8 +37,8 @@ def fuzzy_lookup(tweet):
     # Loop each word in tweet
     for word in tweet.split():
-         # Only get digits and letters
-        word = re.sub("[^a-zA-Z0-9@]", "", word)
         scores = []
         matched_words = []
         # If word > 4 chars
@@ -132,11 +132,11 @@ def predict(tweet):
     fuzzy_text, matches = fuzzy_lookup(tweet)
     processed_text = preprocess(fuzzy_text)
     output = query(processed_text)
-    match_profanities = set(processed_text.split()) & set(list(profanities.keys()))
     if 'error' in output:
         return output['error'], 'Error occured. Try again later.', {}
-    elif len(match_profanities) == 0:
         return 'No Profanity Found.', '', {}
     else:
         output = [tuple(i.values()) for i in output[0]]
@@ -146,12 +146,15 @@ def predict(tweet):
         if predicted_label == 'Abusive':
             # Censor
             for base_word, _ in matches.items():
-                tweet = tweet.replace(base_word, re.sub("[a-zA-Z0-9@]", "*", base_word))
             return output, tweet, json.dumps(matches)
         else:
             return output, tweet, json.dumps(matches)
-# output, tweet, matches = predict('Sama ng ugali mo pre')
 # print(output, '\n', tweet, '\n', matches)
 hf_writer = gr.HuggingFaceDatasetSaver('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', 'tagalog-profanity-feedbacks')

     # Loop each word in tweet
     for word in tweet.split():
+         # Only get digits and letters then lowercase
+        word = re.sub("[^a-zA-Z0-9@]", "", word).lower()
         scores = []
         matched_words = []
         # If word > 4 chars
     fuzzy_text, matches = fuzzy_lookup(tweet)
     processed_text = preprocess(fuzzy_text)
     output = query(processed_text)
     if 'error' in output:
         return output['error'], 'Error occured. Try again later.', {}
+    elif len(matches) == 0:
         return 'No Profanity Found.', '', {}
     else:
         output = [tuple(i.values()) for i in output[0]]
         if predicted_label == 'Abusive':
             # Censor
             for base_word, _ in matches.items():
+                mask = '*' * len(base_word)
+                compiled = re.compile(re.escape(base_word), re.IGNORECASE)
+                tweet = compiled.sub(mask, tweet)
+                # tweet = tweet.replace(base_word, re.sub("[a-zA-Z0-9@]", "*", base_word))
             return output, tweet, json.dumps(matches)
         else:
             return output, tweet, json.dumps(matches)
+# output, tweet, matches = predict('ul0L Sama ng ugali mo pre Tangina uL0l!!!')
 # print(output, '\n', tweet, '\n', matches)
 hf_writer = gr.HuggingFaceDatasetSaver('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', 'tagalog-profanity-feedbacks')