Spaces: Runtime error
zeynepgulhan committed · Commit 79bbdf9 · Parent(s): cbd2f6b
app file created
app.py ADDED
@@ -0,0 +1,101 @@
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
import re

from turkish.deasciifier import Deasciifier

# Model and tokenizer initialization
tokenizer = AutoTokenizer.from_pretrained("TURKCELL/bert-offensive-lang-detection-tr")
model = AutoModelForSequenceClassification.from_pretrained("TURKCELL/bert-offensive-lang-detection-tr")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


def deasciifier(text):
    # Restore Turkish characters from their ASCII approximations
    deasc = Deasciifier(text)
    return deasc.convert_to_turkish()


def remove_circumflex(text):
    # Map circumflexed vowels to their plain counterparts
    circumflex_map = {
        'â': 'a',
        'î': 'i',
        'û': 'u',
        'ô': 'o',
        'Â': 'A',
        'Î': 'I',
        'Û': 'U',
        'Ô': 'O'
    }

    return ''.join(circumflex_map.get(c, c) for c in text)


def turkish_lower(text):
    # Turkish-aware lowercasing (e.g. 'I' -> 'ı', 'İ' -> 'i')
    turkish_map = {
        'I': 'ı',
        'İ': 'i',
        'Ç': 'ç',
        'Ş': 'ş',
        'Ğ': 'ğ',
        'Ü': 'ü',
        'Ö': 'ö'
    }
    return ''.join(turkish_map.get(c, c).lower() for c in text)


def clean_text(text):
    # Remove circumflexed letters from the text
    text = remove_circumflex(text)
    # Convert the text to lowercase
    text = turkish_lower(text)
    # Deasciify
    text = deasciifier(text)
    # Remove user mentions
    text = re.sub(r"@\S*", " ", text)
    # Remove hashtags
    text = re.sub(r'#\S+', ' ', text)
    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", ' ', text, flags=re.MULTILINE)
    # Remove punctuation and text-based emoticons
    text = re.sub(r'[^\w\s]|(:\)|:\(|:D|:P|:o|:O|;\))', ' ', text)
    # Remove emojis
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r' ', text)

    # Collapse multiple whitespace characters into a single space
    text = re.sub(r'\s+', ' ', text).strip()
    return text


def is_offensive(sentence):
    normalized_text = clean_text(sentence)

    test_sample = tokenizer(normalized_text, padding=True, truncation=True, max_length=256, return_tensors='pt')
    test_sample = {k: v.to(device) for k, v in test_sample.items()}

    output = model(**test_sample)
    y_pred = np.argmax(output.logits.detach().cpu().numpy(), axis=1)

    labels = {0: 'non-offensive', 1: 'offensive'}
    return labels[y_pred[0]]


iface = gr.Interface(
    fn=is_offensive,
    inputs=gr.Textbox(lines=2, placeholder="Enter sentence here..."),
    outputs="text",
    title="Offensive Language Detection",
    description="Offensive language detection for Turkish"
)

iface.launch()
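Judging from the imports, the Space needs gradio, transformers, torch, numpy, and the turkish-deasciifier package installed for app.py to start. The snippet below is a sketch, not part of the commit: a quick way to exercise the preprocessing and classification helpers outside the Gradio UI. It assumes the definitions above are already in scope (for example, pasted into a REPL and stopped before iface.launch()) and that the model weights downloaded successfully; the sample sentences are made-up inputs.

# Sketch only (not part of the commit). Assumes clean_text / is_offensive from
# app.py above are already defined in this session and the model is loaded.
samples = [
    "Bugun hava cok guzel :)",                       # ASCII-typed Turkish with an emoticon
    "@kullanici su linke bak http://example.com",    # mention plus URL
]
for s in samples:
    # clean_text strips mentions, URLs, and emoticons, lowercases with Turkish
    # rules, and deasciifies before the model sees the text
    print(repr(clean_text(s)), "->", is_offensive(s))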