SuperBigtoo committed on
Commit
833997d
1 Parent(s): 93acf27
__pycache__/thai_tokenization.cpython-311.pyc ADDED
Binary file (5.04 kB).
 
app.py CHANGED
@@ -1,12 +1,34 @@
  import gradio as gr
+ import torch
+ from simpletransformers.classification import ClassificationModel
+ from pythainlp import sent_tokenize
+ from thai_tokenization import ThaiTokenizer
 
- def greet(name):
-     return "Hello " + name + "!!"
-
+ tokenizer = ThaiTokenizer(vocab_file='ThaiNewsClassify/th.wiki.bpe.op25000.vocab', spm_file='ThaiNewsClassify/th.wiki.bpe.op25000.model')
+
+ typeId = {'การเมือง': 0, 'กีฬา': 1, 'คุณภาพชีวิต': 2, 'ทั่วไทย': 3, 'ไลฟ์สไตล์': 4,
+           'อื่นๆ': 5, 'อาชญากรรม': 6, 'สิ่งแวดล้อม': 7, 'บันเทิง & วัฒนธรรม': 8, 'เศรษฐกิจ': 9,
+           'วิทยาศาสตร์ & การศึกษา': 10, 'สังคม': 11, 'unspecified': 12, 'ต่างประเทศ': 13}
+
+ loaded_model = ClassificationModel(
+     "bert",
+     "ThaiNewsClassify/model/thainewsClassify_model_3_14",
+     use_cuda=torch.cuda.is_available(),
+     num_labels=14,
+ )
+
+ def predict_type(title_input):
+     title_input = title_input.lower()
+     title_input = sent_tokenize(title_input)
+     title_input = ' '.join(tokenizer.tokenize(' '.join(title_input)))
+     predictions, raw_outputs = loaded_model.predict([title_input])
+     predicted_label_name = [type_name for type_name, type_id in typeId.items() if type_id == predictions[0]]
+     return f"Predicted News Type: {predicted_label_name[0]}"
+
  iface = gr.Interface(
-     fn=greet,
+     fn=predict_type,
      inputs=gr.Textbox(lines=1, max_lines=10, label="Input News's Title"),
-     outputs=gr.Textbox(lines=1, max_lines=10, label="Predicted News's Type"),
+     outputs=gr.Textbox(lines=1, max_lines=2, label="Predicted News's Type"),
      title="Thai News Classify",
      examples=["จบสกอร์ไม่คม หมดครึ่งแรก ยูเครน เจ๊า โปแลนด์ 0-0",
                "แอรินยินดีนาฑี มีรักใหม่ ยันจบกันด้วยดี ปัดถ่ายแฟชั่นเซ็กซี่ประชดรัก อ้างถูกใจคอนเซปต์ (คลิป)",
model/thainewsClassify_model_3_14/config.json ADDED
@@ -0,0 +1,59 @@
+ {
+   "_name_or_path": "ThaiNewsClassify/model/thainewsClassify_model_3_14",
+   "architectures": [
+     "BertForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "embedding_size": 768,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "id2label": {
+     "0": "LABEL_0",
+     "1": "LABEL_1",
+     "2": "LABEL_2",
+     "3": "LABEL_3",
+     "4": "LABEL_4",
+     "5": "LABEL_5",
+     "6": "LABEL_6",
+     "7": "LABEL_7",
+     "8": "LABEL_8",
+     "9": "LABEL_9",
+     "10": "LABEL_10",
+     "11": "LABEL_11",
+     "12": "LABEL_12",
+     "13": "LABEL_13"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "label2id": {
+     "LABEL_0": 0,
+     "LABEL_1": 1,
+     "LABEL_10": 10,
+     "LABEL_11": 11,
+     "LABEL_12": 12,
+     "LABEL_13": 13,
+     "LABEL_2": 2,
+     "LABEL_3": 3,
+     "LABEL_4": 4,
+     "LABEL_5": 5,
+     "LABEL_6": 6,
+     "LABEL_7": 7,
+     "LABEL_8": 8,
+     "LABEL_9": 9
+   },
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "problem_type": "single_label_classification",
+   "torch_dtype": "float32",
+   "transformers_version": "4.34.1",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 25004
+ }
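The id2label/label2id entries above are the generic LABEL_0 … LABEL_13 names that simpletransformers writes out; the human-readable Thai category names live only in the typeId dict in app.py. A minimal sketch (reusing that dict; the helper below is hypothetical) of mapping a raw label id back to its category name:

# Invert app.py's typeId mapping so a predicted label id yields a Thai category name.
typeId = {'การเมือง': 0, 'กีฬา': 1, 'คุณภาพชีวิต': 2, 'ทั่วไทย': 3, 'ไลฟ์สไตล์': 4,
          'อื่นๆ': 5, 'อาชญากรรม': 6, 'สิ่งแวดล้อม': 7, 'บันเทิง & วัฒนธรรม': 8, 'เศรษฐกิจ': 9,
          'วิทยาศาสตร์ & การศึกษา': 10, 'สังคม': 11, 'unspecified': 12, 'ต่างประเทศ': 13}
id_to_type = {v: k for k, v in typeId.items()}

def label_name(label_id: int) -> str:
    return id_to_type.get(label_id, 'unspecified')  # e.g. label_name(1) == 'กีฬา'

This does the same lookup as the list comprehension in predict_type(), just precomputed once instead of scanning the dict on every prediction.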
model/thainewsClassify_model_3_14/model_args.json ADDED
@@ -0,0 +1 @@
+ {"adafactor_beta1": null, "adafactor_clip_threshold": 1.0, "adafactor_decay_rate": -0.8, "adafactor_eps": [1e-30, 0.001], "adafactor_relative_step": true, "adafactor_scale_parameter": true, "adafactor_warmup_init": true, "adam_betas": [0.9, 0.999], "adam_epsilon": 1e-08, "best_model_dir": "outputs/best_model", "cache_dir": "cache_dir/", "config": {}, "cosine_schedule_num_cycles": 0.5, "custom_layer_parameters": [], "custom_parameter_groups": [{"lr": 0.01}], "dataloader_num_workers": 0, "do_lower_case": false, "dynamic_quantize": false, "early_stopping_consider_epochs": false, "early_stopping_delta": 0, "early_stopping_metric": "eval_loss", "early_stopping_metric_minimize": true, "early_stopping_patience": 3, "encoding": null, "eval_batch_size": 64, "evaluate_during_training": false, "evaluate_during_training_silent": true, "evaluate_during_training_steps": 2000, "evaluate_during_training_verbose": false, "evaluate_each_epoch": true, "fp16": false, "gradient_accumulation_steps": 1, "learning_rate": 4e-05, "local_rank": -1, "logging_steps": 50, "loss_type": null, "loss_args": {}, "manual_seed": null, "max_grad_norm": 1.0, "max_seq_length": 128, "model_name": "/content/models/simple_transformer/thainewsClassify_model_2_14", "model_type": "bert", "multiprocessing_chunksize": -1, "n_gpu": 1, "no_cache": true, "no_save": false, "not_saved_args": [], "num_train_epochs": 3, "optimizer": "AdamW", "output_dir": "/content/models/simple_transformer/thainewsClassify_model_2_14", "overwrite_output_dir": true, "polynomial_decay_schedule_lr_end": 1e-07, "polynomial_decay_schedule_power": 1.0, "process_count": 1, "quantized_model": false, "reprocess_input_data": true, "save_best_model": true, "save_eval_checkpoints": true, "save_model_every_epoch": false, "save_optimizer_and_scheduler": true, "save_steps": -1, "scheduler": "linear_schedule_with_warmup", "silent": false, "skip_special_tokens": true, "tensorboard_dir": null, "thread_count": null, "tokenizer_name": "/content/models/simple_transformer/thainewsClassify_model_2_14", "tokenizer_type": null, "train_batch_size": 64, "train_custom_parameters_only": false, "use_cached_eval_features": false, "use_early_stopping": false, "use_hf_datasets": false, "use_multiprocessing": false, "use_multiprocessing_for_evaluation": false, "wandb_kwargs": {}, "wandb_project": null, "warmup_ratio": 0.06, "warmup_steps": 707, "weight_decay": 0.0, "model_class": "ClassificationModel", "labels_list": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "labels_map": {}, "lazy_delimiter": "\t", "lazy_labels_column": 1, "lazy_loading": false, "lazy_loading_start_line": 1, "lazy_text_a_column": null, "lazy_text_b_column": null, "lazy_text_column": 0, "onnx": false, "regression": false, "sliding_window": false, "special_tokens_list": [], "stride": 0.8, "tie_value": 1}
model/thainewsClassify_model_3_14/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2dad22f227d95fc5f151dcbe803e6a728c00e35558c2c60fccb30cf30de21a93
+ size 421089518
model/thainewsClassify_model_3_14/special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "cls_token": {
+     "content": "[CLS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "[MASK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "[PAD]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "[SEP]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
model/thainewsClassify_model_3_14/tokenizer.json ADDED
The diff for this file is too large to render.
 
model/thainewsClassify_model_3_14/tokenizer_config.json ADDED
@@ -0,0 +1,65 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "4": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "[CLS]",
+   "do_basic_tokenize": true,
+   "do_lower_case": false,
+   "lowercase": false,
+   "mask_token": "[MASK]",
+   "max_length": 128,
+   "model_max_length": 1000000000000000019884624838656,
+   "never_split": null,
+   "pad_to_multiple_of": null,
+   "pad_token": "[PAD]",
+   "pad_token_type_id": 0,
+   "padding_side": "right",
+   "sep_token": "[SEP]",
+   "stride": 0,
+   "strip_accents": false,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "truncation_side": "right",
+   "truncation_strategy": "longest_first",
+   "unk_token": "<unk>"
+ }
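Since tokenizer_class is BertTokenizer and vocab.txt is committed alongside this config, the tokenizer can also be loaded directly with transformers, independently of simpletransformers; a minimal sketch, assuming the repository's directory layout:

# Hypothetical check that the committed tokenizer files load as a plain BertTokenizer.
from transformers import BertTokenizer

tok = BertTokenizer.from_pretrained('model/thainewsClassify_model_3_14')
print(tok.tokenize('ยูเครน เจ๊า โปแลนด์'))  # WordPiece pieces over the ~25k-entry vocab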
model/thainewsClassify_model_3_14/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:69373dbab9d89dd72fa6822e7fddc57d5408a32d55680d343b993634892f0f34
+ size 3768
model/thainewsClassify_model_3_14/vocab.txt ADDED
The diff for this file is too large to render.
 
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ torch
+ tensorflow
+ transformers
+ simpletransformers
+ pythainlp
+ numpy
th.wiki.bpe.op25000.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3c58c571078266e44a63d151ee1a14c7f3c4adfdf44b3282f21a0d7bc2b97a1d
+ size 926663
th.wiki.bpe.op25000.vocab ADDED
The diff for this file is too large to render.
 
thai_tokenization.py ADDED
@@ -0,0 +1,87 @@
+ import collections
+ import unicodedata
+ import six
+
+ def convert_to_unicode(text):
+   """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
+   if six.PY3:
+     if isinstance(text, str):
+       return text
+     elif isinstance(text, bytes):
+       return text.decode("utf-8", "ignore")
+     else:
+       raise ValueError("Unsupported string type: %s" % (type(text)))
+   elif six.PY2:
+     if isinstance(text, str):
+       return text.decode("utf-8", "ignore")
+     elif isinstance(text, unicode):
+       return text
+     else:
+       raise ValueError("Unsupported string type: %s" % (type(text)))
+   else:
+     raise ValueError("Not running on Python2 or Python 3?")
+
+ def load_vocab(vocab_file):
+   vocab = collections.OrderedDict()
+   index = 0
+   with open(vocab_file, "r", encoding='utf-8') as reader:
+     while True:
+       token = reader.readline()
+       if token.split(): token = token.split()[0]  # to support SentencePiece vocab file
+       token = convert_to_unicode(token)
+       if not token:
+         break
+       token = token.strip()
+       vocab[token] = index
+       index += 1
+   return vocab
+
+ #####
+
+ from bert.bpe_helper import BPE
+ import sentencepiece as spm
+
+ def convert_by_vocab(vocab, items):
+   output = []
+   for item in items:
+     output.append(vocab[item])
+   return output
+
+ class ThaiTokenizer(object):
+   """Tokenizes Thai texts."""
+
+   def __init__(self, vocab_file, spm_file):
+     self.vocab = load_vocab(vocab_file)
+     self.inv_vocab = {v: k for k, v in self.vocab.items()}
+
+     self.bpe = BPE(vocab_file)
+     self.s = spm.SentencePieceProcessor()
+     self.s.Load(spm_file)
+
+   def tokenize(self, text):
+     bpe_tokens = self.bpe.encode(text).split(' ')
+     spm_tokens = self.s.EncodeAsPieces(text)
+
+     tokens = bpe_tokens if len(bpe_tokens) < len(spm_tokens) else spm_tokens
+
+     split_tokens = []
+
+     for token in tokens:
+       new_token = token
+
+       if token.startswith('_') and not token in self.vocab:
+         split_tokens.append('_')
+         new_token = token[1:]
+
+       if not new_token in self.vocab:
+         split_tokens.append('<unk>')
+       else:
+         split_tokens.append(new_token)
+
+     return split_tokens
+
+   def convert_tokens_to_ids(self, tokens):
+     return convert_by_vocab(self.vocab, tokens)
+
+   def convert_ids_to_tokens(self, ids):
+     return convert_by_vocab(self.inv_vocab, ids)
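A minimal usage sketch for ThaiTokenizer, assuming sentencepiece and the bert.bpe_helper module it imports (not added in this commit) are installed, and using the BPE files committed at the repository root:

# Hypothetical quick check of the tokenizer on its own, not part of this commit.
from thai_tokenization import ThaiTokenizer

tokenizer = ThaiTokenizer(vocab_file='th.wiki.bpe.op25000.vocab',
                          spm_file='th.wiki.bpe.op25000.model')

tokens = tokenizer.tokenize('จบสกอร์ไม่คม หมดครึ่งแรก')  # BPE/SentencePiece pieces; out-of-vocab pieces become <unk>
ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokens)
print(ids)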