mginoben committed on
Commit
3172d47
β€’
1 Parent(s): 48392ea

Added match words list on output

Browse files
Files changed (1) hide show
  1. app.py +88 -66
app.py CHANGED
@@ -2,79 +2,99 @@ import gradio as gr
2
  import requests
3
  import emoji
4
  import re
 
 
 
 
5
 
6
  API_URL = "https://api-inference.huggingface.co/models/Dabid/test2"
7
  headers = {"Authorization": "Bearer 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"}
8
 
9
- profanities = ['bobo', 'bobong', 'bwiset', 'bwisit', 'buwisit', 'buwiset', 'bwesit', 'gago', 'gagong', 'kupal',
10
- 'pakshet', 'pakyu', 'pucha', 'puchang',
11
- 'punyeta', 'punyetang', 'puta', 'putang', 'putangina', 'putanginang', 'tanga', 'tangang', 'tangina',
12
- 'tanginang', 'tarantado', 'tarantadong', 'ulol']
13
-
14
- contractions = {
15
- 'di': 'hindi',
16
- 'to': 'ito',
17
- 'no': 'ano',
18
- 'kundi': 'kung hindi',
19
- 'nya': 'niya',
20
- 'nyo': 'ninyo',
21
- 'niyo': 'ninyo',
22
- 'pano': 'paano',
23
- 'sainyo': 'sa inyo',
24
- 'sayo': 'sa iyo',
25
- 'pag': 'kapag',
26
- 'kesa': 'kaysa',
27
- 'dun': 'doon',
28
- 'ganto': 'ganito',
29
- 'nandun': 'nandoon',
30
- 'saka': 'tsaka',
31
- 'ung': 'yung',
32
- 'wag': 'huwag',
33
- 'sya': 'siya',
34
- 'bat': 'bakit',
35
- 'yon': 'iyon',
36
- 'yun': 'iyon',
37
- 'dyan': 'diyan',
38
- 'jan': 'diyan',
39
- 'andito': 'nandito',
40
- 'tanginamo': 'tangina mo',
41
- 'putanginamo': 'putangina mo',
42
- 'san': 'saan',
43
- 'ganun': 'ganoon',
44
- 'gagong': 'gago na',
45
- 'bobong': 'bobo na',
46
- 'tangang': 'tanga na',
47
- 'kelan': 'kailan',
48
- 'raw': 'daw',
49
- 'tanginang': 'tangina na',
50
- 'tarantadong': 'tarantado na',
51
- 'putang ina': 'putangina',
52
- 'putang inang': 'putangina',
53
- 'putanginang': 'putangina',
54
- 'itong': 'ito ang',
55
- 'lng': 'lang',
56
- 'bwisit': 'bwiset',
57
- 'bwesit': 'bwiset',
58
- 'buwisit': 'bwiset',
59
- 'buwesit': 'bwiset'
60
- }
61
-
62
-
63
- def preprocess(row):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  laugh_texts = ['hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha']
65
  symbols = ['@', '#']
66
 
67
  # Lowercase
68
- row = row.lower()
69
 
70
  # Remove emojis
71
- row = emoji.replace_emoji(row, replace='')
72
 
73
  # Replace elongated words 'grabeee' -> 'grabe' (not applicable on 2 corresponding letter)
74
- row = re.sub(r'(.)\1{2,}', r'\1', row)
75
 
76
  # Split sentence into list of words
77
- row_split = row.split()
78
 
79
  for index, word in enumerate(row_split):
80
 
@@ -117,11 +137,12 @@ def query(payload):
117
 
118
 
119
  def predict(text):
120
- output = query(preprocess(text))
121
- print(preprocess(text))
 
122
 
123
  if 'error' in output:
124
- return output['error'], 'Error occured. Try again later.'
125
  else:
126
  output = [tuple(i.values()) for i in output[0]]
127
  output = dict((x, y) for x, y in output)
@@ -136,11 +157,11 @@ def predict(text):
136
  for i in profanity:
137
  mask += "*" if i != " " else " "
138
  output_text = compiled.sub(mask, output_text)
139
- return output, output_text
140
  else:
141
- return output, text
142
-
143
 
 
144
 
145
 
146
  hf_writer = gr.HuggingFaceDatasetSaver('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', 'tagalog-profanity-feedbacks')
@@ -152,7 +173,8 @@ demo = gr.Interface(
152
  inputs=[gr.components.Textbox(lines=5, placeholder='Enter your input here', label='INPUT')],
153
 
154
  outputs=[gr.components.Label(num_top_classes=2, label="PREDICTION"),
155
- gr.components.Text(label='OUTPUT')],
 
156
 
157
  examples=['Tangina mo naman sobrang yabang mo gago!!😠😀 @davidrafael',
158
  'Napakainit ngayong araw pakshet namaaan!!',
 
2
  import requests
3
  import emoji
4
  import re
5
+ import json
6
+ from thefuzz import process, fuzz
7
+ import numpy as np
8
+
9
 
10
  API_URL = "https://api-inference.huggingface.co/models/Dabid/test2"
11
  headers = {"Authorization": "Bearer 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"}
12
 
13
+ profanities = ['bobo', 'bwiset','gago', 'kupal',
14
+ 'pakshet', 'pakyu', 'pucha',
15
+ 'punyeta', 'puta', 'pota', 'putangina', 'tanga', 'tangina',
16
+ 'tarantado', 'ulol']
17
+
18
def read_text(filename, filetype='txt'):
    """Load a word list or mapping from disk.

    Args:
        filename: Path without extension; ``.txt`` or ``.json`` is
            appended depending on *filetype*.
        filetype: ``'txt'`` reads one word per line and deduplicates;
            ``'json'`` parses the file with :mod:`json`. Any other value
            falls through and an empty list is returned.

    Returns:
        For ``'txt'``, a list of unique, right-stripped lines (order not
        guaranteed); for ``'json'``, whatever the JSON file contains;
        otherwise an empty list.
    """
    words = []

    if filetype == 'txt':
        # Explicit UTF-8: the word lists contain Tagalog text, so don't
        # depend on the platform's default encoding.
        with open(filename + '.txt', encoding='utf-8') as file:
            # Strip trailing whitespace/newlines and deduplicate in one pass.
            words = list({line.rstrip() for line in file})
    elif filetype == 'json':
        with open(filename + '.json', encoding='utf-8') as json_file:
            words = json.load(json_file)

    return words
30
+
31
+
32
+ contractions = read_text('contractions', 'json')
33
+ lookup_words = read_text('lookup_words')
34
+ obj_pronouns = read_text('obj_pronouns')
35
+ profanities = read_text('profanities', 'json')
36
+
37
+
38
def fuzzyLookup(tweet):
    """Fuzzy-match each word of *tweet* against the lookup vocabulary and
    rewrite detected profanity variants to their canonical form.

    Relies on module-level globals loaded at import time:
    ``profanities`` (dict: canonical profanity -> list of variations),
    ``lookup_words`` (flat list of known words), ``fuzz`` (thefuzz) and
    ``np`` (numpy).

    Returns:
        tuple: ``(rewritten_tweet, matches_json)`` where ``matches_json``
        is a JSON string mapping each original word to its replacement.
    """
    # Flatten every canonical profanity plus all of its variations into a
    # single lookup array used for membership tests below.
    lookup_profanity = np.concatenate([np.hstack(list(profanities.values())), list(profanities.keys())])
    # NOTE(review): this local list duplicates the module-level
    # ``obj_pronouns`` loaded from file — confirm which is authoritative.
    obj_pronoun = ['ko', 'mo', 'nya', 'natin', 'namin', 'ninyo', 'nila', 'ka', 'nyo', 'ng']
    # Maps original tweet word -> best-matching profanity (possibly with
    # a detached object pronoun appended later).
    matches = dict()

    # Score every tweet word against the full lookup vocabulary.
    for word in tweet.split():
        scores = []
        matched_words = []
        # Skip words shorter than 4 chars — too many false fuzzy matches.
        if len(word) >= 4:
            for lookup_word in lookup_words:
                score = fuzz.ratio(word, lookup_word)
                if score >= 65:  # similarity threshold on thefuzz's 0-100 scale
                    scores.append(score)
                    matched_words.append(lookup_word)
            if len(scores) > 0:
                # Keep only the single best-scoring candidate, and record
                # it only when that candidate is a known profanity.
                max_score_index = np.argmax(scores)
                if matched_words[max_score_index] in lookup_profanity:
                    matches[word] = matched_words[max_score_index]

    # Detect a fused object pronoun (e.g. 'tanginamo' -> 'tangina mo'):
    # split the original word on the profanity's last two characters and
    # compare the trailing fragment against the pronoun list.
    for word, matched_profanity in matches.items():
        word_split = word.split(matched_profanity[-2:])
        for pronoun in obj_pronoun:
            if len(word_split) > 1:
                if pronoun == word_split[-1]:
                    matches[word] = matched_profanity + ' ' + pronoun
                    break

    # Rewrite each matched word to its normalized form.
    # NOTE(review): str.replace is substring-based, so a matched short
    # word may also rewrite longer words that contain it — confirm this
    # is the intended behavior.
    for word, matched_profanity in matches.items():
        tweet = tweet.replace(word, matched_profanity)

    # Finally, collapse any remaining variation tokens to their canonical
    # profanity key via exact whole-word lookup.
    tweet_split = tweet.split()
    for profanity, prof_varations in profanities.items():
        for i, word in enumerate(tweet_split):
            if word in prof_varations:
                tweet_split[i] = profanity
    tweet = ' '.join(tweet_split)

    return tweet, json.dumps(matches)
81
+
82
+
83
+ def preprocess(text):
84
  laugh_texts = ['hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha']
85
  symbols = ['@', '#']
86
 
87
  # Lowercase
88
+ text = text.lower()
89
 
90
  # Remove emojis
91
+ text = emoji.replace_emoji(text, replace='')
92
 
93
  # Replace elongated words 'grabeee' -> 'grabe' (not applicable on 2 corresponding letter)
94
+ text = re.sub(r'(.)\1{2,}', r'\1', text)
95
 
96
  # Split sentence into list of words
97
+ row_split = text.split()
98
 
99
  for index, word in enumerate(row_split):
100
 
 
137
 
138
 
139
  def predict(text):
140
+ text= preprocess(text)
141
+ text, matches = fuzzyLookup(text)
142
+ output = query(text)
143
 
144
  if 'error' in output:
145
+ return output['error'], 'Error occured. Try again later.', {"error": "error"}
146
  else:
147
  output = [tuple(i.values()) for i in output[0]]
148
  output = dict((x, y) for x, y in output)
 
157
  for i in profanity:
158
  mask += "*" if i != " " else " "
159
  output_text = compiled.sub(mask, output_text)
160
+ return output, output_text, matches
161
  else:
162
+ return output, text, matches
 
163
 
164
+ # TODO gag0 not appearing
165
 
166
 
167
  hf_writer = gr.HuggingFaceDatasetSaver('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', 'tagalog-profanity-feedbacks')
 
173
  inputs=[gr.components.Textbox(lines=5, placeholder='Enter your input here', label='INPUT')],
174
 
175
  outputs=[gr.components.Label(num_top_classes=2, label="PREDICTION"),
176
+ gr.components.Text(label='OUTPUT'),
177
+ gr.components.JSON()],
178
 
179
  examples=['Tangina mo naman sobrang yabang mo gago!!😠😀 @davidrafael',
180
  'Napakainit ngayong araw pakshet namaaan!!',