Spaces:

SuperBigtoo
/

ThaiNewsClassify

Running

File size: 3,648 Bytes

a31bbda
833997d
dc929b8
833997d
 
 
42d0b45
833997d
42d0b45
 
 
 
 
 
 
833997d
 
 
 
 
 
42d0b45
 
833997d
 
 
 
 
 
df74a6d
833997d
 
 
 
9d1cd5c
a31bbda
f1293be
833997d
99819ca
833997d
99819ca
4e652d8
 
 
 
 
 
b649cf6
 
99819ca
f1293be
680ded3
 
f1293be

import gradio as gr
import torch
import re
from simpletransformers.classification import ClassificationModel
from pythainlp import sent_tokenize
from thai_tokenization import ThaiTokenizer
from transformers import AutoTokenizer

# Alternative tokenizer setup, currently disabled:
# tokenizer = ThaiTokenizer(vocab_file='th.wiki.bpe.op25000.vocab', spm_file='th.wiki.bpe.op25000.model')

# Tokenizer used by predict_type() to split a cleaned title into tokens.
# It is loaded from the 'main' revision of the checkpoint named below, with
# the tokenizer's model_max_length set to 416.
_TOKENIZER_CHECKPOINT = 'airesearch/wangchanberta-base-att-spm-uncased'

tokenizer = AutoTokenizer.from_pretrained(
    _TOKENIZER_CHECKPOINT,
    revision='main',
    model_max_length=416,
)

# News category names listed in class-id order: a name's position in this
# tuple is the integer id it receives in typeId below.
_TYPE_NAMES = (
    'การเมือง',
    'กีฬา',
    'คุณภาพชีวิต',
    'ทั่วไทย',
    'ไลฟ์สไตล์',
    'อื่นๆ',
    'อาชญากรรม',
    'สิ่งแวดล้อม',
    'บันเทิง & วัฒนธรรม',
    'เศรษฐกิจ',
    'วิทยาศาสตร์ & การศึกษา',
    'สังคม',
    'unspecified',
    'ต่างประเทศ',
)

# Maps each category name to its integer class id (0 through 13).
typeId = {type_name: type_id for type_id, type_name in enumerate(_TYPE_NAMES)}

# Classifier used by predict_type(): a "camembert"-type simpletransformers
# ClassificationModel with 14 output labels, loaded from the checkpoint named
# below. CUDA is used only when torch reports it as available.
_cuda_available = torch.cuda.is_available()

loaded_model = ClassificationModel(
    "camembert",
    "SuperBigtoo/thainews-classification-wangchanberta",
    use_cuda=_cuda_available,
    num_labels=14,
)

# Punctuation removed from titles before tokenization: hyphen, colon, curly and
# straight quotes, period, comma, percent, slash, parentheses, square
# brackets, exclamation mark, semicolon and backslash.
_TITLE_PUNCTUATION = re.compile(r'[-:‘’“”.,%/()\[\]\'"!;\\/]')


def predict_type(title_input):
    """Predict the news category name for a news title.

    The title is lower-cased, stripped of punctuation, split into sentences
    with ``sent_tokenize``, re-joined, tokenized with the module-level
    ``tokenizer`` and passed to ``loaded_model`` as one space-separated
    string. The predicted class id is then translated back to its name via
    ``typeId``.

    Args:
        title_input: The title text. ``None``, an empty string, or text that
            contains nothing but whitespace/punctuation is treated as blank.

    Returns:
        The category name (a key of ``typeId``) for the predicted class, or
        an empty string when the input is blank, in which case the model is
        not called at all.

    Raises:
        KeyError: If the model predicts an id that has no entry in ``typeId``.
    """
    # Guard first: None would crash on .lower(), and there is nothing to
    # classify in an empty title.
    if not title_input:
        return ''

    cleaned = _TITLE_PUNCTUATION.sub('', title_input.lower())
    if not cleaned.strip():
        # Only punctuation/whitespace was supplied; skip the model call.
        return ''

    sentences = sent_tokenize(cleaned)
    model_input = ' '.join(tokenizer.tokenize(' '.join(sentences)))

    # predict() returns (predictions, raw_outputs); only the label is needed.
    predictions, _ = loaded_model.predict([model_input])

    # int() normalizes the predicted id to a plain Python int before lookup.
    id_to_name = {type_id: type_name for type_name, type_id in typeId.items()}
    return id_to_name[int(predictions[0])]

# Sample news titles passed to the interface as its `examples`.
_SAMPLE_TITLES = [
    'จบสกอร์ไม่คม หมดครึ่งแรก ยูเครน เจ๊า โปแลนด์ 0-0',
    'แอรินยินดีนาฑี มีรักใหม่ ยันจบกันด้วยดี ปัดถ่ายแฟชั่นเซ็กซี่ประชดรัก อ้างถูกใจคอนเซปต์ (คลิป)',
    'แนวโน้ม ราคาทอง ปี 63 ไตรมาสแรกส่อลงแรงก่อนทะยานขึ้นอาจเห็นบาทละ 23750',
    'ปล้นรถ ปล้นปืน เผย 8 ศพกระบี่ เป็นไปได้ทีมฆ่าถูกสั่งตรงจากชายแดนใต้',
    'อินโดฯ จัดกิจกรรมศาสนา ไม่สนโควิด-19 หวั่นซ้ำรอยมาเลเซีย',
    'วงคุยว่าด้วย ชาติที่เรา (จะ) รักของ นิธิผ่านนักวิชาการ 3 รุ่น หลังเลือกตั้ง',
    'ฆ่าโหด "น้องชายจุฬาราชมนตรี" หนุ่มใหญ่หลอนยา ชักมีดฟัน-แทงยับ',
    'กู้เงินออมสิน 10,000-300,000 บาท ผ่านออนไลน์ ดอกเบี้ยต่ำใช่เหรอ',
]

# One text box for the news title, one text box for the predicted category.
_title_box = gr.Textbox(lines=1, max_lines=10, label="Input News's Title")
_result_box = gr.Textbox(lines=1, max_lines=2, label="Predicted News's Type")

# Gradio interface that routes the title box through predict_type().
iface = gr.Interface(
    fn=predict_type,
    inputs=_title_box,
    outputs=_result_box,
    title="Thai News Classify",
    examples=_SAMPLE_TITLES,
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()