Spaces:

mginoben
/

tagalog-profanity-classification

Sleeping

mginoben committed on Apr 23, 2023

Commit

40a4fcd

•

1 Parent(s): 6e30a48

Added English words lookup

Files changed (2) hide show

app.py CHANGED Viewed

@@ -7,7 +7,9 @@ from thefuzz import process, fuzz
 import numpy as np
 import re
 from string import punctuation
-import time
 API_URL = "https://api-inference.huggingface.co/models/Dabid/abusive-tagalog-profanity-detection"
@@ -36,8 +38,8 @@ contractions = read_text('contractions', 'json')
 lookup_words = read_text('lookup_words')
 obj_pronouns = read_text('obj_pronouns')
 profanities = read_text('profanities', 'json')
-loading_countdown = 0
 def fuzzy_lookup(tweet):
@@ -45,6 +47,8 @@ def fuzzy_lookup(tweet):
     lookup_profanity = np.concatenate([np.hstack(list(profanities.values())), list(profanities.keys())])
     for word in tweet.split():
         scores = []
         matched_words = []
         word = word.strip(punctuation)
@@ -121,7 +125,6 @@ def preprocess(tweet):
 def predict(tweet):
-    global loading_countdown
     preprocessed_tweet, matched_profanity = preprocess(tweet)

 import numpy as np
 import re
 from string import punctuation
+import nltk
+nltk.download('words')
+from nltk.corpus import words
 API_URL = "https://api-inference.huggingface.co/models/Dabid/abusive-tagalog-profanity-detection"
 lookup_words = read_text('lookup_words')
 obj_pronouns = read_text('obj_pronouns')
 profanities = read_text('profanities', 'json')
+eng_words = set(words.words())
 def fuzzy_lookup(tweet):
     lookup_profanity = np.concatenate([np.hstack(list(profanities.values())), list(profanities.keys())])
     for word in tweet.split():
+        if word in eng_words:
+            break
         scores = []
         matched_words = []
         word = word.strip(punctuation)
 def predict(tweet):
     preprocessed_tweet, matched_profanity = preprocess(tweet)

requirements.txt CHANGED Viewed

@@ -1,3 +1,4 @@
 emoji
 thefuzz[speedup]
 numpy

 emoji
 thefuzz[speedup]
 numpy
+nltk