|
import time |
|
|
|
import gradio as gr |
|
import numpy as np |
|
import torch |
|
|
|
from transformers import AutoModelForSequenceClassification, AutoTokenizer |
|
|
|
# Run on the first CUDA GPU when PyTorch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
|
|
|
|
|
def get_model():
    """Load the gibberish-detection classifier and its tokenizer.

    Both objects are created with ``from_pretrained`` from the
    ``TURKCELL/gibberish-detection-model-tr`` checkpoint. The model is moved
    to the module-level ``device`` and the elapsed loading time is printed.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    checkpoint = "TURKCELL/gibberish-detection-model-tr"
    began = time.time()

    classifier = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    text_tokenizer = AutoTokenizer.from_pretrained(
        checkpoint,
        do_lower_case=True,
        use_fast=True,
    )
    classifier.to(device)

    elapsed = time.time() - began
    print(f'bert model loading time {elapsed}')
    return text_tokenizer, classifier
|
|
|
|
|
# Load the tokenizer and model once, at import time; process_sentence_with_bert
# passes these module-level objects to get_result_for_one_sample on every call.
tokenizer, model = get_model() |
|
|
|
|
|
# Predicted class index -> label returned to the caller.
_ID_TO_LABEL = {
    0: 'real',
    1: 'gibberish',
}


def get_result_for_one_sample(model, tokenizer, device, sample):
    """Classify a single text as ``'real'`` or ``'gibberish'``.

    Args:
        model: Callable that accepts the tokenizer output as keyword
            arguments and returns an object with a ``logits`` tensor holding
            one row per input and one column per class.
        tokenizer: Callable invoked with a one-element list of texts plus
            ``padding``, ``truncation``, ``max_length`` and
            ``return_tensors`` keyword arguments; its result must support
            ``.to(device)`` and ``**`` unpacking.
        device: Device the encoded inputs are moved to before the forward
            pass.
        sample: The text to classify.

    Returns:
        ``'gibberish'`` when class 1 has the highest logit, ``'real'`` when
        class 0 does.

    Raises:
        KeyError: If the highest-scoring class index is neither 0 nor 1.
    """
    encoded = tokenizer(
        [sample],
        padding=True,
        truncation=True,
        max_length=256,
        return_tensors='pt',
    ).to(device)

    # Inference only: without no_grad() every request would build an autograd
    # graph that is never used, wasting memory and time.
    with torch.no_grad():
        logits = model(**encoded).logits

    predicted = np.argmax(logits.detach().to('cpu').numpy(), axis=1)
    # Only one sample was encoded, so the first row is the answer.
    return _ID_TO_LABEL[int(predicted[0])]
|
|
|
|
|
def process_sentence_with_bert(sentence):
    """Classify ``sentence`` with the module-level model and print timing.

    Prints a start message, delegates to ``get_result_for_one_sample`` using
    the module-level ``model``, ``tokenizer`` and ``device``, then prints the
    elapsed wall-clock time in seconds.

    Args:
        sentence: The text to classify.

    Returns:
        Whatever ``get_result_for_one_sample`` returns for ``sentence``.
    """
    print('processing text with bert')
    began = time.time()
    label = get_result_for_one_sample(model, tokenizer, device, sentence)
    elapsed = time.time() - began
    print(f'bert processing time {elapsed}')
    return label
|
|
|
|
|
def classify_gibberish(sentence, ignore_words_file):
    """Classify ``sentence`` via ``process_sentence_with_bert``.

    Args:
        sentence: The text to classify.
        ignore_words_file: Accepted to match the two-input interface but
            currently not read by this function.

    Returns:
        The result of ``process_sentence_with_bert(sentence)``.
    """
    return process_sentence_with_bert(sentence)
|
|
|
|
|
# Build the Gradio UI: classify_gibberish is the callback, fed by a two-line
# textbox and a file upload, with a single textbox showing the result.
# NOTE(review): classify_gibberish does not read its second argument, so the
# uploaded "ignore words" file currently has no effect -- confirm whether
# ignore-word filtering was intended.
iface = gr.Interface(fn=classify_gibberish, |
|
inputs=[gr.Textbox(lines=2, placeholder="Enter Sentence Here..."), |
|
gr.File(label="Upload Ignore Words File")], |
|
outputs=gr.Textbox(label="Gibberish Detection Result"), |
|
title="Simple Gibberish Text Detection For Turkish", |
|
description="""Simple gibberish text detection given text like |
|
adsfdnsfnıunf |
|
sasdlsöefls.""") |
|
# Start the app. This runs at module import time (there is no __main__ guard).
iface.launch() |
|
|