mginoben committed on
Commit
bf5fae7
1 Parent(s): 4e90ce5

Profanity with hashtag detection

Browse files
Files changed (2) hide show
  1. app.py +64 -40
  2. contractions.json +4 -1
app.py CHANGED
@@ -7,8 +7,7 @@ from thefuzz import process, fuzz
7
  import numpy as np
8
  import re
9
  import nltk
10
- nltk.download('words')
11
- from nltk.corpus import words
12
 
13
 
14
  API_URL = "https://api-inference.huggingface.co/models/Dabid/abusive-tagalog-profanity-detection"
@@ -36,58 +35,86 @@ def read_text(filename, filetype='txt'):
36
  contractions = read_text('contractions', 'json')
37
  similar_words = read_text('similar_words')
38
  addon_words = read_text('addon_words')
39
- profanities = read_text('profanities', 'json')
40
- lookup_profanity = np.concatenate([np.hstack(list(profanities.values())), list(profanities.keys())])
41
- lookup_words = list(set(similar_words).union(set(lookup_profanity.tolist())))
42
- eng_words = list(set(words.words()) - set(lookup_profanity))
43
  punctuations = re.compile(r'^[^\w#@]+|[^\w#@]+$')
44
 
45
- # TODO check eng words that are tagalog profanities
46
-
47
  def fuzzy_lookup(tweet):
48
 
49
  matched_profanity = dict()
50
 
51
- # tweet = punctuations.sub('', tweet).lower()
52
-
53
  for word in tweet.split():
54
 
 
 
 
 
 
 
 
 
55
  word = punctuations.sub('', word).lower()
 
 
56
  base_word = word
 
 
57
  word = re.sub(r'(.)\1{2,}', r'\1', word)
58
-
59
- if word in eng_words:
60
- continue
61
-
 
 
62
  for addon in addon_words:
63
  if word.startswith(addon):
64
  word = word[len(addon):]
65
  if word.endswith(addon):
66
  word = word[:-len(addon)]
67
 
68
- if word.startswith("@") or word.startswith("#"):
69
- word = word[1:]
70
 
71
- scores = []
72
- matched_words = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
- if len(word) >= 4:
75
- # Get fuzzy ratio
76
- for lookup_word in lookup_words:
77
- score = fuzz.ratio(word, lookup_word)
78
- if score >= 70:
79
- scores.append(score)
80
- matched_words.append(lookup_word)
81
- if len(scores) > 0:
82
- max_score_index = np.argmax(scores)
83
- if matched_words[max_score_index] in lookup_profanity:
84
- for base_profanity, profanity_variations in profanities.items():
85
- if matched_words[max_score_index] == base_profanity:
86
- matched_profanity[base_word] = base_profanity
87
- break
88
- if matched_words[max_score_index] in profanity_variations:
89
- matched_profanity[base_word] = base_profanity
90
- break
91
 
92
  return matched_profanity
93
 
@@ -108,11 +135,6 @@ def preprocess(tweet, profanities):
108
 
109
  for index, word in enumerate(row_split):
110
 
111
- # Separate pronouns
112
- for addon in addon_words:
113
- if word.endswith(addon):
114
- row_split[index] = word[:-len(addon)] + " " + addon
115
-
116
  # Remove links
117
  if 'http' in word:
118
  row_split[index] = ''
@@ -150,9 +172,11 @@ def predict(tweet):
150
  print(prediction)
151
  error_message = prediction['error']
152
  return error_message, {}
 
153
  prediction = prediction[0][0]["label"]
154
 
155
  print("\nTWEET:", tweet)
 
156
  print("DETECTED PROFANITY:", list(profanities.keys()))
157
  print("LABEL:", prediction, "\n")
158
 
 
7
  import numpy as np
8
  import re
9
  import nltk
10
+ from english_words import get_english_words_set
 
11
 
12
 
13
  API_URL = "https://api-inference.huggingface.co/models/Dabid/abusive-tagalog-profanity-detection"
 
35
  contractions = read_text('contractions', 'json')
36
  similar_words = read_text('similar_words')
37
  addon_words = read_text('addon_words')
38
+ profanities_dict = read_text('profanities', 'json')
39
+ lookup_profanity = np.concatenate([np.hstack(list(profanities_dict.values())), list(profanities_dict.keys())]).tolist()
40
+ lookup_words = list(set(similar_words).union(set(lookup_profanity)))
41
+ eng_words = list(get_english_words_set(['web2'], lower=True) - set(lookup_profanity))
42
  punctuations = re.compile(r'^[^\w#@]+|[^\w#@]+$')
43
 
 
 
44
  def fuzzy_lookup(tweet):
45
 
46
  matched_profanity = dict()
47
 
 
 
48
  for word in tweet.split():
49
 
50
+ if word in eng_words:
51
+ continue
52
+
53
+ scores = []
54
+ matched_words = []
55
+ matched_word = None
56
+
57
+ # Remove leading/trailing punctuation except # and @
58
  word = punctuations.sub('', word).lower()
59
+
60
+ # Save base word
61
  base_word = word
62
+
63
+ # Shorten elongated word
64
  word = re.sub(r'(.)\1{2,}', r'\1', word)
65
+
66
+ # Remove # and @
67
+ if word.startswith("#") or word.startswith("@"):
68
+ word = word[1:]
69
+
70
+ # Remove leading/trailing addon words (mo, ka, pinaka)
71
  for addon in addon_words:
72
  if word.startswith(addon):
73
  word = word[len(addon):]
74
  if word.endswith(addon):
75
  word = word[:-len(addon)]
76
 
77
+ if len(word) < 4:
78
+ continue
79
 
80
+ # Get fuzzy ratio
81
+ for lookup_word in lookup_words:
82
+
83
+ score = fuzz.ratio(word, lookup_word)
84
+
85
+ # Threshold
86
+ if score >= 70:
87
+ scores.append(score)
88
+ matched_words.append(lookup_word)
89
+
90
+ if len(scores) == 0:
91
+ continue
92
+
93
+ if len(set(scores)) == 1:
94
+ for matched_word in matched_words:
95
+ if matched_word in lookup_profanity:
96
+ matched_word = matched_word
97
+ break
98
+ else:
99
+ # Get matched word with max score
100
+ max_score_index = np.argmax(scores)
101
+ matched_word = matched_words[max_score_index]
102
+
103
+ if matched_word not in lookup_profanity:
104
+ continue
105
+
106
+ for base_profanity, profanity_variations in profanities_dict.items():
107
+
108
+ if matched_word in profanity_variations or matched_word == base_profanity:
109
+
110
+ # Separate pronouns
111
+ for addon in addon_words:
112
+ if base_word.endswith(addon):
113
+ base_profanity = base_profanity + " " + addon
114
+ break
115
 
116
+ matched_profanity[base_word] = base_profanity
117
+ break
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
  return matched_profanity
120
 
 
135
 
136
  for index, word in enumerate(row_split):
137
 
 
 
 
 
 
138
  # Remove links
139
  if 'http' in word:
140
  row_split[index] = ''
 
172
  print(prediction)
173
  error_message = prediction['error']
174
  return error_message, {}
175
+
176
  prediction = prediction[0][0]["label"]
177
 
178
  print("\nTWEET:", tweet)
179
+ print("PROCESSED TWEET:", preprocessed_tweet)
180
  print("DETECTED PROFANITY:", list(profanities.keys()))
181
  print("LABEL:", prediction, "\n")
182
 
contractions.json CHANGED
@@ -29,5 +29,8 @@
29
  "kelan": "kailan",
30
  "raw": "daw",
31
  "itong": "ito ang",
32
- "lng": "lang"
 
 
 
33
  }
 
29
  "kelan": "kailan",
30
  "raw": "daw",
31
  "itong": "ito ang",
32
+ "lng": "lang",
33
+ "putang ina": "putangina",
34
+ "tangina" : "tangina",
35
+ "inamo" : "ina mo"
36
  }