Spaces:

mginoben
/

tagalog-profanity-classification

Sleeping

App Files Files Community

mginoben committed on May 11, 2023

Commit

fa21182

•

1 Parent(s): 8adc428

Modified app.py

Browse files

Files changed (1) hide show

app.py +25 -13

app.py CHANGED Viewed

@@ -6,7 +6,6 @@ import json
 from thefuzz import process, fuzz
 import numpy as np
 import re
-from string import punctuation
 import nltk
 nltk.download('words')
 from nltk.corpus import words
@@ -47,7 +46,7 @@ punctuations = re.compile(r'^[^\w#@]+|[^\w#@]+$')
 def fuzzy_lookup(tweet):
-    matched_profanity = []
     # tweet = punctuations.sub('', tweet).lower()
@@ -55,6 +54,7 @@ def fuzzy_lookup(tweet):
         word = punctuations.sub('', word).lower()
         base_word =  word
         if word in eng_words:
             continue
@@ -71,8 +71,6 @@ def fuzzy_lookup(tweet):
         scores = []
         matched_words = []
-        print(word)
         if len(word) >= 4:
             # Get fuzzy ratio
             for lookup_word in lookup_words:
@@ -83,16 +81,26 @@ def fuzzy_lookup(tweet):
             if len(scores) > 0:
                 max_score_index = np.argmax(scores)
                 if matched_words[max_score_index] in lookup_profanity:
-                    matched_profanity.append(base_word)
     return matched_profanity
-def preprocess(tweet):
     tweet = tweet.lower()
     tweet = emoji.replace_emoji(tweet, replace='')
     # Elongated words conversion
     tweet = re.sub(r'(.)\1{2,}', r'\1', tweet)
@@ -125,10 +133,11 @@ def preprocess(tweet):
 def predict(tweet):
-    preprocessed_tweet = preprocess(tweet)
-    matched_profanity = fuzzy_lookup(preprocessed_tweet)
-    if len(matched_profanity) > 0:
         prediction = query(preprocessed_tweet)
@@ -139,10 +148,10 @@ def predict(tweet):
         prediction = prediction[0][0]["label"]
         print("\nTWEET:", tweet)
-        print("DETECTED PROFANITY:", matched_profanity)
         print("LABEL:", prediction, "\n")
-        return prediction, matched_profanity
     return "No Profanity", {}
@@ -165,4 +174,7 @@ demo = gr.Interface(
     title="Tagalog Profanity Classifier"
 )
-demo.launch(debug=True)

 from thefuzz import process, fuzz
 import numpy as np
 import re
 import nltk
 nltk.download('words')
 from nltk.corpus import words
 def fuzzy_lookup(tweet):
+    matched_profanity = dict()
     # tweet = punctuations.sub('', tweet).lower()
         word = punctuations.sub('', word).lower()
         base_word =  word
+        word = re.sub(r'(.)\1{2,}', r'\1', word)
         if word in eng_words:
             continue
         scores = []
         matched_words = []
         if len(word) >= 4:
             # Get fuzzy ratio
             for lookup_word in lookup_words:
             if len(scores) > 0:
                 max_score_index = np.argmax(scores)
                 if matched_words[max_score_index] in lookup_profanity:
+                    for base_profanity, profanity_variations in profanities.items():
+                        if matched_words[max_score_index] == base_profanity:
+                            matched_profanity[base_word] = base_profanity
+                            break
+                        if matched_words[max_score_index] in profanity_variations:
+                            matched_profanity[base_word] = base_profanity
+                            break
     return matched_profanity
+def preprocess(tweet, profanities):
     tweet = tweet.lower()
     tweet = emoji.replace_emoji(tweet, replace='')
+    # Replace profanities
+    for base_word, matched_word in profanities.items():
+        tweet = tweet.replace(base_word, matched_word)
     # Elongated words conversion
     tweet = re.sub(r'(.)\1{2,}', r'\1', tweet)
 def predict(tweet):
+    profanities = fuzzy_lookup(tweet)
+    if len(profanities) > 0:
+        preprocessed_tweet = preprocess(tweet, profanities)
         prediction = query(preprocessed_tweet)
         prediction = prediction[0][0]["label"]
         print("\nTWEET:", tweet)
+        print("DETECTED PROFANITY:", list(profanities.keys()))
         print("LABEL:", prediction, "\n")
+        return prediction, list(profanities.keys())
     return "No Profanity", {}
     title="Tagalog Profanity Classifier"
 )
+# demo.launch(debug=True)
+tweet = "Tangaaa pala eh mamatay ka na pakyuuuu gag000 ul0l bob0 t4nginamo"
+predict(tweet)