Spaces:

thealper2
/

turkish-hate-speech

Runtime error

App Files Files Community

thealper2 committed on May 4, 2023

Commit

c4b9239

•

1 Parent(s): 68db425

Upload 2 files

Browse files

Files changed (2) hide show

app.py +59 -3
requirements.txt +2 -0

app.py CHANGED Viewed

@@ -2,8 +2,37 @@ import gradio as gr
 import pandas as pd
 import torch
 import os
 from transformers import BertTokenizer, BertModel
 class BertClassifier(torch.nn.Module):
   def __init__(self, dropout=0.5):
     super(BertClassifier, self).__init__()
@@ -35,7 +64,9 @@ class BertClassifier(torch.nn.Module):
 model = BertClassifier()
 tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-base-turkish-uncased")
 model.load_state_dict(torch.load('tubitak2.pt', map_location=torch.device('cpu')))
 def predict_text(model, sentence):
   device = torch.device("cpu")
   #model = model.cuda()
@@ -68,10 +99,35 @@ def predict_text(model, sentence):
     # Kategorik sınıfı döndür.
     return categories.get(output.argmax(dim=1).item())
 def predict(df):
     # TODO:
-    df['text'] = df['text'].apply(preprocess_text)
     for i in range(len(df)):
       df.loc[i, 'label'] = predict_text(model, df['text'][i])

 import pandas as pd
 import torch
 import os
+import re
+import string
+import ntlk
+import emoji
+from nltk.corpus import stopwords
 from transformers import BertTokenizer, BertModel
+nltk.download('stopwords')
+stop_words_list = stopwords.words('turkish')
+# Preprocessing function: lowercases the text, removes digits, punctuation and stop words, squeezes repeated characters, and drops very short tokens.
+def preprocess_text(text):
+    # Convert to lowercase. NOTE(review): str.lower() is not locale-aware (Turkish 'I' becomes 'i', not 'ı') — confirm this is intended.
+    text = text.lower()
+    # Replace newline characters with spaces
+    text = re.sub(r'\n', ' ', text)
+    # Remove digits
+    text = re.sub(r'\d', '', text)
+    # Remove ASCII punctuation (the characters in string.punctuation)
+    text = text.translate(str.maketrans("", "", string.punctuation))
+    # Split on whitespace and drop tokens found in stop_words_list
+    words = text.split()
+    words = [word for word in words if not word in stop_words_list]
+    # Collapse any run of a repeated character down to exactly two occurrences
+    words = [re.sub(r'(.)\1{1,}', r'\1\1', word) for word in words]
+    # Drop tokens that are one character or shorter, then rejoin with single spaces
+    words = [word.strip() for word in words if len(word.strip()) > 1]
+    text = " ".join(words)
+    return text
 class BertClassifier(torch.nn.Module):
   def __init__(self, dropout=0.5):
     super(BertClassifier, self).__init__()
 model = BertClassifier()
 tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-base-turkish-uncased")
 model.load_state_dict(torch.load('tubitak2.pt', map_location=torch.device('cpu')))
 def predict_text(model, sentence):
   device = torch.device("cpu")
   #model = model.cuda()
     # Kategorik sınıfı döndür.
     return categories.get(output.argmax(dim=1).item())
+import re
+# Preprocessing function: lowercases the text, removes digits, punctuation and stop words, squeezes repeated characters, and drops very short tokens.
+def preprocess_text(text):
+    # Convert to lowercase. NOTE(review): str.lower() is not locale-aware (Turkish 'I' becomes 'i', not 'ı') — confirm this is intended.
+    text = text.lower()
+    # Replace newline characters with spaces
+    text = re.sub(r'\n', ' ', text)
+    # Remove digits
+    text = re.sub(r'\d', '', text)
+    # Remove ASCII punctuation (the characters in string.punctuation); the module is imported locally here
+    import string
+    text = text.translate(str.maketrans("", "", string.punctuation))
+    # Split on whitespace and drop tokens found in stop_words_list (a name not defined inside this function)
+    words = text.split()
+    words = [word for word in words if not word in stop_words_list]
+    # Collapse any run of a repeated character down to exactly two occurrences
+    words = [re.sub(r'(.)\1{1,}', r'\1\1', word) for word in words]
+    # Drop tokens that are one character or shorter, then rejoin with single spaces
+    words = [word.strip() for word in words if len(word.strip()) > 1]
+    text = " ".join(words)
+    return text
 def predict(df):  # visible part: adds 'clean_text' and 'label' columns to df
     # TODO:
+    regex = r'@\w+\s?'  # an @mention plus at most one trailing whitespace character
+    df['clean_text'] = df['text'].apply(lambda x: re.sub(regex, '', x))  # copy of 'text' with @mentions removed
+    df['clean_text'] = df['clean_text'].apply(preprocess_text)  # run the preprocessing function on every row
     for i in range(len(df)):
       df.loc[i, 'label'] = predict_text(model, df['text'][i])  # NOTE(review): classifies the raw 'text' column; 'clean_text' built above is not read here — confirm intent

requirements.txt CHANGED Viewed

@@ -1,4 +1,6 @@
 gradio==3.28.3
 pandas==1.5.3
 torch==2.0.0
 transformers==4.27.2

+emoji==2.2.0
 gradio==3.28.3
+nltk==3.8
 pandas==1.5.3
 torch==2.0.0
 transformers==4.27.2