# Page-header residue from the hosting site, kept for provenance:
# uploader "zeynepgulhan", commit 79bbdf9 ("app file created", verified),
# file size 3.1 kB.
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
import re
from turkish.deasciifier import Deasciifier
# Model and tokenizer initialization.
# Both are loaded from the same checkpoint id; the model is then moved to the
# GPU when CUDA is available, otherwise it stays on the CPU.
_CHECKPOINT = "TURKCELL/bert-offensive-lang-detection-tr"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(_CHECKPOINT)
model.to(device)
def deasciifier(text):
    """Return `text` after running it through `Deasciifier.convert_to_turkish()`."""
    return Deasciifier(text).convert_to_turkish()
# Translation table: circumflexed vowels -> their plain counterparts.
_CIRCUMFLEX_TABLE = str.maketrans("âîûôÂÎÛÔ", "aiuoAIUO")


def remove_circumflex(text):
    """Replace circumflexed vowels (â, î, û, ô and their capitals) with plain ones.

    Every other character is passed through unchanged.
    """
    return text.translate(_CIRCUMFLEX_TABLE)
# Upper-case letters whose Turkish lower-case form is set explicitly
# (notably dotless I -> ı and dotted İ -> i).
_TURKISH_UPPER_TO_LOWER = str.maketrans("IİÇŞĞÜÖ", "ıiçşğüö")


def turkish_lower(text):
    """Lower-case `text` using Turkish rules for I/İ and the accented capitals.

    The Turkish-specific capitals are remapped first; every character is then
    lower-cased individually with `str.lower`.
    """
    remapped = text.translate(_TURKISH_UPPER_TO_LOWER)
    # Lower-case one character at a time rather than the whole string, so the
    # result never depends on neighbouring characters.
    return ''.join(map(str.lower, remapped))
# Patterns stripped by clean_text(), in the order they are applied.
# Every match is replaced by a single space.
_CLEANUP_PATTERNS = (
    # User mentions.
    re.compile(r"@\S*"),
    # Hashtags.
    re.compile(r'#\S+'),
    # URLs.
    re.compile(r"http\S+|www\S+|https\S+", flags=re.MULTILINE),
    # Punctuation and text-based emoticons.
    # NOTE(review): '[^\w\s]' is tried first and already matches ':' and ';',
    # so the emoticon alternatives after '|' never get a chance to match.
    re.compile(r'[^\w\s]|(:\)|:\(|:D|:P|:o|:O|;\))'),
    # Emoji code-point ranges.
    re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE,
    ),
    # Runs of whitespace collapse to one space.
    re.compile(r'\s+'),
)


def clean_text(text):
    """Normalise raw text before it is tokenised.

    Steps, in order: strip circumflexes, lower-case with Turkish rules, run
    the deasciifier, then blank out mentions, hashtags, URLs, punctuation and
    emoji, collapse whitespace and trim both ends.
    """
    cleaned = deasciifier(turkish_lower(remove_circumflex(text)))
    for pattern in _CLEANUP_PATTERNS:
        cleaned = pattern.sub(' ', cleaned)
    return cleaned.strip()
def is_offensive(sentence):
    """Classify `sentence` as 'offensive' or 'non-offensive'.

    The sentence is normalised with clean_text(), tokenised (truncated to
    max_length=256), moved to the module-level `device`, and passed through
    the module-level `model`. The label with the highest logit is returned.

    Returns:
        'non-offensive' when the arg-max class index is 0, 'offensive' when
        it is 1.
    """
    normalized = clean_text(sentence)
    encoded = tokenizer(normalized, padding=True, truncation=True, max_length=256, return_tensors='pt')
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
    # Inference only: disable autograd so no graph is recorded for the
    # forward pass.
    with torch.no_grad():
        output = model(**encoded)
    predicted = np.argmax(output.logits.detach().cpu().numpy(), axis=1)
    labels = {0: 'non-offensive', 1: 'offensive'}
    # Convert the NumPy integer to a plain int before the dict lookup.
    return labels[int(predicted[0])]
# Gradio UI: a two-line text box feeding is_offensive(), with the returned
# label shown as plain text.
sentence_box = gr.Textbox(lines=2, placeholder="Enter sentence here...")
iface = gr.Interface(
    fn=is_offensive,
    inputs=sentence_box,
    outputs="text",
    title="Offensive Language Detection",
    description="Offensive language detection for Turkish",
)
iface.launch()