mginoben committed on
Commit
bce56c0
•
1 Parent(s): 72286e6

Fixed word lookup including emojis

Browse files
Files changed (1) hide show
  1. app.py +21 -28
app.py CHANGED
@@ -5,16 +5,12 @@ import re
5
  import json
6
  from thefuzz import process, fuzz
7
  import numpy as np
 
8
 
9
 
10
  API_URL = "https://api-inference.huggingface.co/models/Dabid/test2"
11
  headers = {"Authorization": "Bearer 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"}
12
 
13
- profanities = ['bobo', 'bwiset','gago', 'kupal',
14
- 'pakshet', 'pakyu', 'pucha',
15
- 'punyeta', 'puta', 'pota', 'putangina', 'tanga', 'tangina',
16
- 'tarantado', 'ulol']
17
-
18
  def read_text(filename, filetype='txt'):
19
  words = []
20
 
@@ -42,6 +38,8 @@ def fuzzyLookup(tweet):
42
 
43
  # Loop each word in tweet
44
  for word in tweet.split():
 
 
45
  scores = []
46
  matched_words = []
47
  # If word > 4 chars
@@ -77,24 +75,24 @@ def fuzzyLookup(tweet):
77
  tweet_split[i] = profanity
78
  tweet = ' '.join(tweet_split)
79
 
80
- return tweet, json.dumps(matches)
81
 
82
 
83
- def preprocess(text):
84
  laugh_texts = ['hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha']
85
  symbols = ['@', '#']
86
 
87
  # Lowercase
88
- text = text.lower()
89
 
90
  # Remove emojis
91
- text = emoji.replace_emoji(text, replace='')
92
 
93
  # Replace elongated words 'grabeee' -> 'grabe' (not applicable on 2 corresponding letter)
94
- text = re.sub(r'(.)\1{2,}', r'\1', text)
95
 
96
  # Split sentence into list of words
97
- row_split = text.split()
98
 
99
  for index, word in enumerate(row_split):
100
 
@@ -136,32 +134,27 @@ def query(payload):
136
  return response.json()
137
 
138
 
139
- def predict(text):
140
- text= preprocess(text)
141
- text, matches = fuzzyLookup(text)
142
- output = query(text)
 
143
 
144
  if 'error' in output:
145
  return output['error'], 'Error occured. Try again later.', {"error": "error"}
146
  else:
147
  output = [tuple(i.values()) for i in output[0]]
148
  output = dict((x, y) for x, y in output)
149
-
150
  predicted_label = list(output.keys())[0]
151
 
152
  if predicted_label == 'Abusive':
153
- output_text = text
154
- for profanity in profanities:
155
- compiled = re.compile(re.escape(profanity), re.IGNORECASE)
156
- mask = ""
157
- for i in profanity:
158
- mask += "*" if i != " " else " "
159
- output_text = compiled.sub(mask, output_text)
160
- return output, output_text, matches
161
  else:
162
- return output, text, matches
163
-
164
- # TODO gag0 not appearing
165
 
166
 
167
  hf_writer = gr.HuggingFaceDatasetSaver('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', 'tagalog-profanity-feedbacks')
@@ -174,7 +167,7 @@ demo = gr.Interface(
174
 
175
  outputs=[gr.components.Label(num_top_classes=2, label="PREDICTION"),
176
  gr.components.Text(label='OUTPUT'),
177
- gr.components.JSON()],
178
 
179
  examples=['Tangina mo naman sobrang yabang mo gago!!😠😀 @davidrafael',
180
  'Napakainit ngayong araw pakshet namaaan!!',
 
5
  import json
6
  from thefuzz import process, fuzz
7
  import numpy as np
8
+ import re
9
 
10
 
11
  API_URL = "https://api-inference.huggingface.co/models/Dabid/test2"
12
  headers = {"Authorization": "Bearer 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"}
13
 
 
 
 
 
 
14
  def read_text(filename, filetype='txt'):
15
  words = []
16
 
 
38
 
39
  # Loop each word in tweet
40
  for word in tweet.split():
41
+ # Only get digits and letters
42
+ word = re.sub("[^a-zA-Z0-9@]", "", word)
43
  scores = []
44
  matched_words = []
45
  # If word > 4 chars
 
75
  tweet_split[i] = profanity
76
  tweet = ' '.join(tweet_split)
77
 
78
+ return tweet, matches
79
 
80
 
81
+ def preprocess(tweet):
82
  laugh_texts = ['hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha']
83
  symbols = ['@', '#']
84
 
85
  # Lowercase
86
+ tweet = tweet.lower()
87
 
88
  # Remove emojis
89
+ tweet = emoji.replace_emoji(tweet, replace='')
90
 
91
  # Replace elongated words 'grabeee' -> 'grabe' (not applicable on 2 corresponding letter)
92
+ tweet = re.sub(r'(.)\1{2,}', r'\1', tweet)
93
 
94
  # Split sentence into list of words
95
+ row_split = tweet.split()
96
 
97
  for index, word in enumerate(row_split):
98
 
 
134
  return response.json()
135
 
136
 
137
+ def predict(tweet):
138
+
139
+ fuzz_text, matches = fuzzyLookup(tweet)
140
+ processed_text = preprocess(fuzz_text)
141
+ output = query(processed_text)
142
 
143
  if 'error' in output:
144
  return output['error'], 'Error occured. Try again later.', {"error": "error"}
145
  else:
146
  output = [tuple(i.values()) for i in output[0]]
147
  output = dict((x, y) for x, y in output)
 
148
  predicted_label = list(output.keys())[0]
149
 
150
  if predicted_label == 'Abusive':
151
+ for base_word, _ in matches.items():
152
+ tweet = tweet.replace(base_word, re.sub("[a-zA-Z0-9@]", "*", base_word))
153
+
154
+ return output, tweet, json.dumps(matches)
 
 
 
 
155
  else:
156
+ return output, tweet, json.dumps(matches)
157
+
 
158
 
159
 
160
  hf_writer = gr.HuggingFaceDatasetSaver('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', 'tagalog-profanity-feedbacks')
 
167
 
168
  outputs=[gr.components.Label(num_top_classes=2, label="PREDICTION"),
169
  gr.components.Text(label='OUTPUT'),
170
+ gr.components.JSON(label='DETECTED PROFANITIES')],
171
 
172
  examples=['Tangina mo naman sobrang yabang mo gago!!😠😀 @davidrafael',
173
  'Napakainit ngayong araw pakshet namaaan!!',