mginoben committed on
Commit
6912dca
1 Parent(s): 59f8d0b

Fixed disappearing profanities

Browse files
Files changed (2) hide show
  1. app.py +9 -17
  2. lookup_words.txt +1 -0
app.py CHANGED
@@ -31,9 +31,8 @@ obj_pronouns = read_text('obj_pronouns')
31
  profanities = read_text('profanities', 'json')
32
 
33
 
34
- def fuzzyLookup(tweet):
35
  lookup_profanity = np.concatenate([np.hstack(list(profanities.values())), list(profanities.keys())])
36
- obj_pronoun = ['ko', 'mo', 'nya', 'natin', 'namin', 'ninyo', 'nila', 'ka', 'nyo', 'ng']
37
  matches = dict()
38
 
39
  # Loop each word in tweet
@@ -58,7 +57,7 @@ def fuzzyLookup(tweet):
58
 
59
  for word, matched_profanity in matches.items():
60
  word_split = word.split(matched_profanity[-2:])
61
- for pronoun in obj_pronoun:
62
  if len(word_split) > 1:
63
  if pronoun == word_split[-1]:
64
  matches[word] = matched_profanity + ' ' + pronoun
@@ -68,13 +67,12 @@ def fuzzyLookup(tweet):
68
  for word, matched_profanity in matches.items():
69
  tweet = tweet.replace(word, matched_profanity)
70
 
71
- tweet_split = tweet.split()
72
  for profanity, prof_varations in profanities.items():
73
- for i, word in enumerate(tweet_split):
74
- if word in prof_varations:
75
- tweet_split[i] = profanity
76
- tweet = ' '.join(tweet_split)
77
 
 
78
  return tweet, matches
79
 
80
 
@@ -108,10 +106,6 @@ def preprocess(tweet):
108
  if any(x in word for x in laugh_texts):
109
  row_split[index] = 'haha'
110
 
111
- # Remove words with digits (4ever)
112
- if any(x.isdigit() for x in word):
113
- row_split[index] = ''
114
-
115
  # Combine list of words back to sentence
116
  combined_text = ' '.join(filter(None, row_split))
117
 
@@ -136,9 +130,8 @@ def query(payload):
136
 
137
  def predict(tweet):
138
 
139
- fuzz_text, matches = fuzzyLookup(tweet)
140
- processed_text = preprocess(fuzz_text)
141
- output = query(processed_text)
142
 
143
  if 'error' in output:
144
  return output['error'], 'Error occured. Try again later.', {"error": "error"}
@@ -149,14 +142,13 @@ def predict(tweet):
149
 
150
  if predicted_label == 'Abusive':
151
  for base_word, _ in matches.items():
 
152
  tweet = tweet.replace(base_word, re.sub("[a-zA-Z0-9@]", "*", base_word))
153
 
154
  return output, tweet, json.dumps(matches)
155
  else:
156
  return output, tweet, json.dumps(matches)
157
 
158
-
159
-
160
  hf_writer = gr.HuggingFaceDatasetSaver('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', 'tagalog-profanity-feedbacks')
161
 
162
 
 
31
  profanities = read_text('profanities', 'json')
32
 
33
 
34
+ def fuzzy_lookup(tweet):
35
  lookup_profanity = np.concatenate([np.hstack(list(profanities.values())), list(profanities.keys())])
 
36
  matches = dict()
37
 
38
  # Loop each word in tweet
 
57
 
58
  for word, matched_profanity in matches.items():
59
  word_split = word.split(matched_profanity[-2:])
60
+ for pronoun in obj_pronouns:
61
  if len(word_split) > 1:
62
  if pronoun == word_split[-1]:
63
  matches[word] = matched_profanity + ' ' + pronoun
 
67
  for word, matched_profanity in matches.items():
68
  tweet = tweet.replace(word, matched_profanity)
69
 
 
70
  for profanity, prof_varations in profanities.items():
71
+ if len(prof_varations) > 0:
72
+ for prof_variant in prof_varations:
73
+ tweet = tweet.replace(prof_variant, profanity)
 
74
 
75
+ print('Fuzzy Returns:', tweet)
76
  return tweet, matches
77
 
78
 
 
106
  if any(x in word for x in laugh_texts):
107
  row_split[index] = 'haha'
108
 
 
 
 
 
109
  # Combine list of words back to sentence
110
  combined_text = ' '.join(filter(None, row_split))
111
 
 
130
 
131
  def predict(tweet):
132
 
133
+ fuzzy_text, matches = fuzzy_lookup(tweet)
134
+ output = query(preprocess(fuzzy_text))
 
135
 
136
  if 'error' in output:
137
  return output['error'], 'Error occured. Try again later.', {"error": "error"}
 
142
 
143
  if predicted_label == 'Abusive':
144
  for base_word, _ in matches.items():
145
+
146
  tweet = tweet.replace(base_word, re.sub("[a-zA-Z0-9@]", "*", base_word))
147
 
148
  return output, tweet, json.dumps(matches)
149
  else:
150
  return output, tweet, json.dumps(matches)
151
 
 
 
152
  hf_writer = gr.HuggingFaceDatasetSaver('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', 'tagalog-profanity-feedbacks')
153
 
154
 
lookup_words.txt CHANGED
@@ -152,4 +152,5 @@ kang
152
  bubuka
153
  buka
154
  talaga
 
155
  g@g0
 
152
  bubuka
153
  buka
154
  talaga
155
+ tuloy
156
  g@g0