mginoben committed
Commit 7a70c71
1 Parent(s): 21c119c

Reprogrammed app.

Files changed (1)
  1. app.py +84 -65
app.py CHANGED
@@ -9,8 +9,8 @@ import re
 from string import punctuation
 
 
-API_URL = "https://api-inference.huggingface.co/models/Dabid/test2"
-headers = {"Authorization": "Bearer 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"}
+API_URL = "https://api-inference.huggingface.co/models/Dabid/abusive-tagalog-profanity-detection"
+headers = {"Authorization": "Bearer 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"}
 
 def read_text(filename, filetype='txt'):
     words = []
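The hunk above swaps in the new model endpoint but still ships a hardcoded bearer token. A minimal sketch of the usual alternative, assuming the token is exported in an environment variable named HF_API_TOKEN (a name chosen here, not taken from the commit):

import os
import requests

API_URL = "https://api-inference.huggingface.co/models/Dabid/abusive-tagalog-profanity-detection"
# Read the token from the environment so it never lands in version control.
headers = {"Authorization": f"Bearer {os.environ['HF_API_TOKEN']}"}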
@@ -31,20 +31,34 @@ lookup_words = read_text('lookup_words')
 obj_pronouns = read_text('obj_pronouns')
 profanities = read_text('profanities', 'json')
 
+def query(text):
+    text = {"inputs": text}
+    response = requests.post(API_URL, headers=headers, json=text)
+    return response.json()
+
+
+# for profanity in profanities:
+#     print(profanity, process.extractOne(profanity, tweet.split(), scorer=fuzz.ratio))
+
 
 def fuzzy_lookup(tweet):
+
+    matched_profanity = dict()
+
+    # Convert Profanity Dict to List
     lookup_profanity = np.concatenate([np.hstack(list(profanities.values())), list(profanities.keys())])
-    matches = dict()
 
     # Loop each word in tweet
     for word in tweet.split():
+        scores = []
+        matched_words = []
+
         # Remove punctuations
         word = word.strip(punctuation)
 
         # Only get digits and letters then lowercase
-        processed_word = re.sub("[^a-zA-Z0-9@]", "", word).lower()
-        scores = []
-        matched_words = []
+        processed_word = re.sub("[^a-zA-Z0-9@]", "", word)
+
         # If word has at least 4 chars
         if len(processed_word) >= 4:
             # Get fuzzy ratio
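The new per-word pass resets scores and matched_words on every iteration, then compares each processed word against the profanity lookup with a fuzzy ratio. A small sketch of that comparison, assuming the fuzzywuzzy package that the commented-out process.extractOne line references:

from fuzzywuzzy import fuzz

# Obfuscated spellings still score close to their canonical form,
# which is what lets the lookup catch variants like 'ul0l' for 'ulol'.
print(fuzz.ratio("ulol", "ul0l"))  # high ratio despite the digit swap
print(fuzz.ratio("ulol", "haha"))  # an unrelated word scores near zero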
@@ -56,33 +70,30 @@ def fuzzy_lookup(tweet):
         if len(scores) > 0:
             max_score_index = np.argmax(scores)
             if matched_words[max_score_index] in lookup_profanity:
-                matches[word] = matched_words[max_score_index]
-
+                matched_profanity[word] = matched_words[max_score_index]
 
-    for word, matched_profanity in matches.items():
-        word_split = word.split(matched_profanity[-2:])
+    for word, profanity in matched_profanity.items():
+        word_split = word.split(profanity[-2:])
         for pronoun in obj_pronouns:
             if len(word_split) > 1:
                 if pronoun == word_split[-1]:
-                    matches[word] = matched_profanity + ' ' + pronoun
+                    matched_profanity[word] = profanity + ' ' + pronoun
                     break
 
     # Replace each profanities by fuzzy lookup result
-    for word, matched_profanity in matches.items():
-        tweet = tweet.replace(word, matched_profanity)
+    for word, profanity in matched_profanity.items():
+        tweet = tweet.replace(word, profanity)
 
     for profanity, prof_varations in profanities.items():
         if len(prof_varations) > 0:
             for prof_variant in prof_varations:
                 tweet = tweet.replace(prof_variant, profanity)
 
-    return tweet, matches
+    return tweet, matched_profanity
 
 
 def preprocess(tweet):
-    laugh_texts = ['hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha']
-    symbols = ['@', '#']
-
     # Lowercase
     tweet = tweet.lower()
 
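The reworked pronoun pass splits each original word on the last two characters of its matched profanity to check whether an object pronoun is fused onto the end. A toy illustration of that split, with hypothetical words standing in for the app's obj_pronouns and profanities dictionaries:

# Hypothetical data; the real lists come from obj_pronouns and profanities.
profanity = "tangina"
word = "tanginamo"                       # pronoun 'mo' fused onto the profanity
word_split = word.split(profanity[-2:])  # split on 'na' -> ['tangi', 'mo']
if len(word_split) > 1 and word_split[-1] == "mo":
    print(profanity + " " + "mo")        # -> 'tangina mo'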
 
@@ -97,71 +108,85 @@ def preprocess(tweet):
 
     for index, word in enumerate(row_split):
 
-        # Remove words with symbols (e.g. @username, #hashtags)
-        if any(x in word for x in symbols):
-            row_split[index] = ''
-
         # Remove links
         if 'http' in word:
             row_split[index] = ''
 
         # Unify laugh texts format to 'haha'
+        laugh_texts = ['hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha']
         if any(x in word for x in laugh_texts):
             row_split[index] = 'haha'
 
     # Combine list of words back to sentence
-    combined_text = ' '.join(filter(None, row_split))
+    preprocessed_tweet = ' '.join(filter(None, row_split))
 
     # Check if output contains single word then return null
-    if len(combined_text.split()) == 1:
-        return combined_text
-
-    # Filter needed characters
-    combined_text = re.sub(r"[^A-Za-z ]+", '', combined_text)
+    if len(preprocessed_tweet.split()) == 1:
+        return preprocessed_tweet, {}
 
     # Expand Contractions
     for i in contractions.items():
-        combined_text = re.sub(rf"\b{i[0]}\b", i[1], combined_text)
+        preprocessed_tweet = re.sub(rf"\b{i[0]}\b", i[1], preprocessed_tweet)
 
-    return combined_text
+    # Fuzzy Lookup
+    preprocessed_tweet, matches = fuzzy_lookup(preprocessed_tweet)
 
-
-def query(payload):
-    response = requests.post(API_URL, headers=headers, json=payload)
-    return response.json()
+    return preprocessed_tweet, matches
 
 
 def predict(tweet):
 
-    fuzzy_text, matches = fuzzy_lookup(tweet)
-    processed_text = preprocess(fuzzy_text)
-    output = query(processed_text)
+    preprocessed_tweet, matched_profanity = preprocess(tweet)
+
+    prediction = query(preprocessed_tweet)
+
+    if type(prediction) is dict:
+        return "Model is still loading. Try again."
 
-    if 'error' in output:
-        return output['error'], 'Error occured. Try again later.', {}
-    elif len(matches) == 0:
-        return 'No Profanity Found.', '', {}
-    else:
-        output = [tuple(i.values()) for i in output[0]]
-        output = dict((x, y) for x, y in output)
-        predicted_label = list(output.keys())[0]
+    if not matched_profanity:
+        return "No profanity found."
+
+    prediction = [tuple(i.values()) for i in prediction[0]]
+    prediction = dict((x, y) for x, y in prediction)
+
+    print("\n", tweet)
+    print(matched_profanity)
+    print(prediction, "\n")
 
-        if predicted_label == 'Abusive':
-            # Censor
-            for base_word, _ in matches.items():
-                mask = '*' * len(base_word)
-                compiled = re.compile(re.escape(base_word), re.IGNORECASE)
-                tweet = compiled.sub(mask, tweet)
-            # tweet = tweet.replace(base_word, re.sub("[a-zA-Z0-9@]", "*", base_word))
-            return output, tweet, json.dumps(matches)
-        else:
-            return output, tweet, json.dumps(matches)
+    return prediction
 
-# output, tweet, matches = predict('ul0L Sama ng ugali mo pre Tangina uL0l!!!')
-# print(output, '\n', tweet, '\n', matches)
-
-hf_writer = gr.HuggingFaceDatasetSaver('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', 'tagalog-profanity-feedbacks')
+# # def predict(tweet):
+
+# #     fuzzy_text, matches = fuzzy_lookup(tweet)
+# #     processed_text = preprocess(fuzzy_text)
+# #     output = query(processed_text)
+
+# #     if 'error' in output:
+# #         return output['error'], 'Error occured. Try again later.', {}
+# #     elif len(matches) == 0:
+# #         return 'No Profanity Found.', '', {}
+# #     else:
+# #         output = [tuple(i.values()) for i in output[0]]
+# #         output = dict((x, y) for x, y in output)
+# #         predicted_label = list(output.keys())[0]
+
+# #         if predicted_label == 'Abusive':
+# #             # Censor
+# #             for base_word, _ in matches.items():
+# #                 mask = '*' * len(base_word)
+# #                 compiled = re.compile(re.escape(base_word), re.IGNORECASE)
+# #                 tweet = compiled.sub(mask, tweet)
+# #                 # tweet = tweet.replace(base_word, re.sub("[a-zA-Z0-9@]", "*", base_word))
+# #             return output, tweet, json.dumps(matches)
+# #         else:
+# #             return output, tweet, json.dumps(matches)
+
+# # # output, tweet, matches = predict('ul0L Sama ng ugali mo pre Tangina uL0l!!!')
+# # # print(output, '\n', tweet, '\n', matches)
+
+# # hf_writer = gr.HuggingFaceDatasetSaver('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', 'tagalog-profanity-feedbacks')
 
 
 demo = gr.Interface(
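The new predict distinguishes responses by shape: on success the Inference API returns a nested list of label/score objects, while errors (for example a model that is still cold-starting) come back as a JSON object, hence the type(prediction) is dict check. A sketch of the parsing, with a made-up payload:

# Made-up success payload; the shape mirrors the Inference API, the scores are invented.
response = [[{"label": "Abusive", "score": 0.97},
             {"label": "Non-Abusive", "score": 0.03}]]

prediction = dict(tuple(d.values()) for d in response[0])
print(prediction)  # {'Abusive': 0.97, 'Non-Abusive': 0.03}

# A still-loading model instead returns an object such as:
# {"error": "Model ... is currently loading", "estimated_time": 20.0}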
@@ -169,19 +194,13 @@ demo = gr.Interface(
 
     inputs=[gr.components.Textbox(lines=5, placeholder='Enter your input here', label='INPUT')],
 
-    outputs=[gr.components.Label(num_top_classes=2, label="PREDICTION"),
-             gr.components.Text(label='OUTPUT'),
-             gr.components.JSON(label='DETECTED PROFANITIES')],
+    outputs=[gr.components.Label(num_top_classes=2, label="PREDICTION")],
 
     examples=['Tangina mo naman sobrang yabang mo gago!!😠😤 @davidrafael',
               'Napakainit ngayong araw pakshet namaaan!!',
               'Napakabagal naman ng wifi tangina #PLDC #HelloDITO',
               'Bobo ka ba? napakadali lang nyan eh... 🤡',
               'Uy gago laptrip yung nangyare samen kanina HAHAHA😂😂'],
-
-    allow_flagging="manual",
-    flagging_callback=hf_writer,
-    flagging_options=['Good bot', 'Bad bot']
 )
 
-demo.launch()
+demo.launch()
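The interface now exposes only the label output; the Text and JSON components and the flagging hooks are gone. For reference, a minimal self-contained version of the simplified interface; the fn= line sits outside the hunk, so wiring predict in as the handler is an assumption:

import gradio as gr

def predict(text):
    # Stand-in handler; the real app returns the parsed model scores.
    return {"Abusive": 0.9, "Non-Abusive": 0.1}

demo = gr.Interface(
    fn=predict,
    inputs=[gr.components.Textbox(lines=5, placeholder='Enter your input here', label='INPUT')],
    outputs=[gr.components.Label(num_top_classes=2, label="PREDICTION")],
)
demo.launch()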
 