Spaces:

mginoben
/

tagalog-profanity-classification

Sleeping

App Files Files Community

mginoben committed on Mar 4, 2023

Commit

34fbcfb

•

1 Parent(s): 595931f

Add application file

Browse files

Files changed (1) hide show

app.py +161 -0

app.py ADDED Viewed

	@@ -0,0 +1,161 @@

+import gradio as gr
+import pandas as pd
+import requests
+import emoji
+import re
+# Hosted Inference API endpoint that `query` POSTs the preprocessed text to.
+API_URL = "https://api-inference.huggingface.co/models/Dabid/test2"
+# Request headers sent with every call made by `query`.
+# NOTE(security): the bearer token is hard-coded in source, so anyone who can
+# read this file can use it. It should be revoked and supplied through a
+# secret / environment variable instead of being committed.
+headers = {"Authorization": "Bearer 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"}
+# Words that `predict` masks with '****' (case-insensitive, matched as
+# substrings of the raw input) when the text is classified as 'Abusive'.
+# The list contains base words together with their '-ng' forms and common
+# spelling variants.
+profanities = ['bobo', 'bobong', 'bwiset', 'bwisit', 'buwisit', 'buwiset', 'bwesit', 'gago', 'gagong', 'kupal',
+               'pakshet', 'pakyu', 'pucha', 'puchang',
+               'punyeta', 'punyetang', 'puta', 'putang', 'putangina', 'putanginang', 'tanga', 'tangang', 'tangina',
+               'tanginang', 'tarantado', 'tarantadong', 'ulol']
+# Normalization table used by `preprocess`: each key (an informal spelling,
+# contraction or spelling variant) is replaced by its value wherever the key
+# appears as a whole word. Entries are applied one after another in the order
+# listed here, so the order is significant and should be preserved.
+# Besides expanding contractions, the table also collapses spelling variants
+# of some profanities to a single form (e.g. 'bwisit' -> 'bwiset').
+contractions = {
+    'di': 'hindi',
+    'to': 'ito',
+    'no': 'ano',
+    'kundi': 'kung hindi',
+    'nya': 'niya',
+    'nyo': 'ninyo',
+    'niyo': 'ninyo',
+    'pano': 'paano',
+    'sainyo': 'sa inyo',
+    'sayo': 'sa iyo',
+    'pag': 'kapag',
+    'kesa': 'kaysa',
+    'dun': 'doon',
+    'ganto': 'ganito',
+    'nandun': 'nandoon',
+    'saka': 'tsaka',
+    'ung': 'yung',
+    'wag': 'huwag',
+    'sya': 'siya',
+    'bat': 'bakit',
+    'yon': 'iyon',
+    'yun': 'iyon',
+    'dyan': 'diyan',
+    'jan': 'diyan',
+    'andito': 'nandito',
+    'tanginamo': 'tangina mo',
+    'putanginamo': 'putangina mo',
+    'san': 'saan',
+    'ganun': 'ganoon',
+    'gagong': 'gago na',
+    'bobong': 'bobo na',
+    'tangang': 'tanga na',
+    'kelan': 'kailan',
+    'raw': 'daw',
+    'tanginang': 'tangina na',
+    'tarantadong': 'tarantado na',
+    # Multi-word keys: matched across the space between the two words.
+    'putang ina': 'putangina',
+    'putang inang': 'putangina',
+    'putanginang': 'putangina',
+    'itong': 'ito ang',
+    'lng': 'lang',
+    'bwisit': 'bwiset',
+    'bwesit': 'bwiset',
+    'buwisit': 'bwiset',
+    'buwesit': 'bwiset'
+}
+def preprocess(row):
+    laugh_texts = ['hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha']
+    symbols = ['@', '#']
+    # Lowercase
+    row = row.lower()
+    # Remove emojis
+    row = emoji.replace_emoji(row, replace='')
+    # Replace elongated words 'grabeee' -> 'grabe' (not applicable on 2 corresponding letter)
+    row = re.sub(r'(.)\1{2,}', r'\1', row)
+    # Split sentence into list of words
+    row_split = row.split()
+    for index, word in enumerate(row_split):
+        # Remove words with symbols (e.g. @username, #hashtags)
+        if any(x in word for x in symbols):
+            row_split[index] = ''
+        # Remove links
+        if 'http' in word:
+            row_split[index] = ''
+        # Unify laugh texts format to 'haha'
+        if any(x in word for x in laugh_texts):
+            row_split[index] = 'haha'
+        # Remove words with digits (4ever)
+        if any(x.isdigit() for x in word):
+            row_split[index] = ''
+    # Combine list of words back to sentence
+    combined_text = ' '.join(filter(None, row_split))
+    # Check if output contains single word then return null
+    if len(combined_text.split()) == 1:
+        return combined_text
+    # Filter needed characters
+    combined_text = re.sub(r"[^A-Za-z ]+", '', combined_text)
+    # Expand Contractions
+    for i in contractions.items():
+        combined_text = re.sub(rf"\b{i[0]}\b", i[1], combined_text)
+    return combined_text
+def query(payload):
+    response = requests.post(API_URL, headers=headers, json=payload)
+    return response.json()
+def predict(text):
+    print(preprocess(text))
+    output = query(preprocess(text))[0]
+    print(output)
+    output = [tuple(i.values()) for i in output]
+    output = dict((x, y) for x, y in output)
+    predicted_label = list(output.keys())[0]
+    if predicted_label == 'Abusive':
+        output_text = text
+        for i in profanities:
+            compiled = re.compile(re.escape(i), re.IGNORECASE)
+            output_text = compiled.sub('****', output_text)
+        return output, output_text
+    else:
+        return output, text
+hf_writer = gr.HuggingFaceDatasetSaver('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', 'tagalog-profanity-feedbacks')
+demo = gr.Interface(
+    fn=predict,
+    inputs=[gr.components.Textbox(lines=5, placeholder='Enter your input here', label='INPUT')],
+    outputs=[gr.components.Label(num_top_classes=2, label="PREDICTION"),
+             gr.components.Text(label='OUTPUT')],
+    examples=['Tangina mo naman sobrang yabang mo gago!!😠😤 @davidrafael',
+              'Napakainit ngayong araw pakshet namaaan!!',
+              'Napakabagal naman ng wifi tangina #PLDC #HelloDITO',
+              'Bobo ka ba? napakadali lang nyan eh... 🤡',
+              'Uy gago laptrip yung nangyare samen kanina HAHAHA😂😂'],
+    allow_flagging="manual",
+    flagging_callback=hf_writer,
+    flagging_options=['Good bot', 'Bad bot']
+)
+demo.launch(debug=True)