Spaces: truong-xuan-linh/content-category-classification (status: Runtime error)
Commit 7d5dab0, committed by truong-xuan-linh
Parent(s): abc1c82

init
Files changed:
- .github/workflows/main.yml +1 -1
- .gitignore +1 -0
- README.md +11 -0
- app.py +46 -0
- config/classes.json +31 -0
- config/config.yaml +5 -0
- linhai.jpeg +0 -0
- requirements.txt +6 -0
- src/category_model.py +113 -0
- test.ipynb +171 -0
.github/workflows/main.yml CHANGED
@@ -17,4 +17,4 @@ jobs:
       - name: Push to hub
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
-        run: git push https://truong-xuan-linh:$HF_TOKEN@huggingface.co/spaces/truong-xuan-linh/content-category-classification
+        run: git push --force https://truong-xuan-linh:$HF_TOKEN@huggingface.co/spaces/truong-xuan-linh/content-category-classification main
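The change adds `--force` and an explicit `main` refspec, so CI overwrites the Space's git history instead of failing when the local and remote histories diverge. For local testing, a rough Python equivalent of that push step might look like the sketch below; the env-var name and the Space URL are taken from the workflow, everything else is illustrative:

    import os
    import subprocess

    token = os.environ["HF_TOKEN"]  # same secret the workflow injects
    remote = (
        f"https://truong-xuan-linh:{token}"
        "@huggingface.co/spaces/truong-xuan-linh/content-category-classification"
    )
    # --force overwrites the Space's history, matching the updated workflow line
    subprocess.run(["git", "push", "--force", remote, "main"], check=True)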
.gitignore ADDED
@@ -0,0 +1 @@
+__pycache__
README.md CHANGED
@@ -1 +1,12 @@
+---
+title: Content Category Classification
+emoji: 🏆
+colorFrom: pink
+colorTo: purple
+sdk: streamlit
+sdk_version: 1.26.0
+app_file: app.py
+pinned: false
+---
+
 # content_category_classification
app.py ADDED
@@ -0,0 +1,46 @@
+import streamlit as st
+from omegaconf import OmegaConf
+
+# Trick to avoid re-initializing the model on every Streamlit rerun
+if "category_model" not in st.session_state:
+    print("INIT MODEL")
+    from src.category_model import CategoryModel
+    from src.category_model import PhoBERT_classification
+    src_config = OmegaConf.load('config/config.yaml')
+    st.session_state.category_model = CategoryModel(config=src_config)
+    print("DONE INIT MODEL")
+
+st.set_page_config(page_title="Vietnamese Category Classification", layout="wide", page_icon="./linhai.jpeg")
+hide_menu_style = """
+    <style>
+    footer {visibility: hidden;}
+    </style>
+"""
+st.markdown(hide_menu_style, unsafe_allow_html=True)
+
+st.markdown(
+    """
+    <style>
+    [data-testid="stSidebar"][aria-expanded="true"] > div:first-child{
+        width: 400px;
+    }
+    [data-testid="stSidebar"][aria-expanded="false"] > div:first-child{
+        margin-left: -400px;
+    }
+    </style>
+    """,
+    unsafe_allow_html=True,
+)
+
+st.markdown("<h2 style='text-align: center; color: grey;'>Input: Vietnamese content</h2>", unsafe_allow_html=True)
+st.markdown("<h2 style='text-align: center; color: grey;'>Output: Content classification</h2>", unsafe_allow_html=True)
+
+content = st.text_input("Enter your content", value="The content must be at least 50 words long.")
+
+if st.button("Submit"):
+    st.write("**RESULT:** ")
+    if len(content.split()) < 50:
+        st.write("The content must be at least 50 words long.")
+    else:
+        result = st.session_state.category_model.predict(content)
+        st.write(result)
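The session-state guard above exists because Streamlit re-executes the entire script on every widget interaction; caching the model in `st.session_state` keeps the checkpoint load to once per browser session. A minimal sketch of the same pattern, with a hypothetical `load_expensive_model` standing in for `CategoryModel`:

    import streamlit as st

    def load_expensive_model():
        # Hypothetical stand-in for CategoryModel(config=...); pretend this
        # takes many seconds and lots of memory
        return object()

    # Streamlit reruns this whole script on every widget interaction, but
    # st.session_state persists across reruns within one browser session
    if "model" not in st.session_state:
        st.session_state.model = load_expensive_model()

    st.write("Model loaded exactly once per session.")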
config/classes.json ADDED
@@ -0,0 +1,31 @@
+{"music": 0,
+"food": 1,
+"technology": 2,
+"travel": 3,
+"animal": 4,
+"life": 5,
+"family": 6,
+"entertainment": 7,
+"education": 8,
+"youth": 9,
+"fun": 10,
+"cartoon": 11,
+"science": 12,
+"economy": 13,
+"history": 14,
+"shopping": 15,
+"celebrity": 16,
+"law": 17,
+"movie": 18,
+"book": 19,
+"beauty": 20,
+"health": 21,
+"world": 22,
+"sports": 23,
+"nature": 24,
+"news": 25,
+"fashion": 26,
+"game": 27,
+"culture": 28,
+"vehicles": 29,
+"medical": 30}
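The JSON maps category names to output indices; since the model returns probabilities by position, inference needs the inverse map. A small sketch of the inversion that `src/category_model.py` performs:

    import json

    # name -> index, exactly as committed above
    classes = json.load(open("./config/classes.json", "r"))

    # index -> name, used to turn output positions back into labels
    id2label = {v: k for k, v in classes.items()}

    assert id2label[10] == "fun"
    assert id2label[30] == "medical"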
config/config.yaml ADDED
@@ -0,0 +1,5 @@
+model:
+  path: ./models/freeze_clean_warnup_0.0005_0.6648_2.8259.pt
+  url: https://drive.google.com/uc?id=1gKBx1sgHhJOyLmCidCm_serwgDYG6U1g
+  threshold: 0.3
+  min_length: 50
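app.py loads this file with OmegaConf, which exposes the YAML keys as attribute paths. A quick sketch of how the values are read:

    from omegaconf import OmegaConf

    config = OmegaConf.load("config/config.yaml")

    print(config.model.path)        # ./models/freeze_clean_warnup_0.0005_0.6648_2.8259.pt
    print(config.model.threshold)   # 0.3, minimum probability for a label to be reported
    print(config.model.min_length)  # 50, minimum word count enforced in app.py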
linhai.jpeg ADDED
requirements.txt ADDED
@@ -0,0 +1,6 @@
+transformers==4.28.1
+torch==2.0.0
+gdown==4.7.1
+underthesea==6.7.0
+omegaconf==2.0.6
+streamlit==1.26.0
src/category_model.py ADDED
@@ -0,0 +1,113 @@
+import os
+import re
+import json
+import gdown
+
+import numpy as np
+import torch
+import torch.nn as nn
+from underthesea import word_tokenize
+from transformers import AutoTokenizer
+
+class PhoBERT_classification(nn.Module):
+    # num_classes and device are explicit parameters here; the checkpoint is
+    # restored via torch.load, which bypasses __init__ entirely
+    def __init__(self, phobert, num_classes=31, device="cpu"):
+        super(PhoBERT_classification, self).__init__()
+
+        self.phobert = phobert
+        self.dropout = nn.Dropout(0.2)
+        self.relu = nn.ReLU()
+        # PhoBERT hidden size (768) -> 512 -> one probability per class
+        self.fc1 = nn.Linear(768, 512, device=device)
+        self.fc2 = nn.Linear(512, num_classes, device=device)
+        self.softmax = nn.Softmax(dim=1)
+
+    def forward(self, input_ids, attention_mask):
+        last_hidden_states, cls_hs = self.phobert(input_ids=input_ids,
+                                                  attention_mask=attention_mask,
+                                                  return_dict=False)
+
+        # Classify from the representation of the first ([CLS]) token
+        x = self.fc1(last_hidden_states[:, 0, :])
+        x = self.relu(x)
+        x = self.dropout(x)
+
+        x = self.fc2(x)
+        x = self.softmax(x)
+
+        return x
+
+
+class CategoryModel():
+    def __init__(self, config):
+        self.DEVICE = "cpu"  # torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.classes = json.load(open("./config/classes.json", "r"))
+        self.id2label = {v: k for k, v in self.classes.items()}
+
+        self.config = config
+        self.get_model()
+
+    def get_model(self):
+        self.tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")
+
+        # Download the checkpoint from Google Drive on first run
+        if not os.path.isfile(self.config.model.path):
+            gdown.download(self.config.model.url, self.config.model.path, quiet=True)
+        self.model = torch.load(self.config.model.path, map_location=self.DEVICE)
+        self.model.eval()
+
+    def predict(self, paragraph):
+
+        def clean_string(input_string):
+            # Use a regular expression to strip characters that are not
+            # letters, digits, or whitespace
+            input_string = input_string.replace("\n", " ")
+            split_string = input_string.split()
+            # Title-case fully uppercase words so segmentation handles them better
+            input_string = " ".join([text.title() if text.isupper() else text for text in split_string])
+            cleaned_string = re.sub(r'[^\w\s]', '', input_string)
+            return cleaned_string
+
+        def input_tokenizer(text):
+            text = clean_string(text)
+            # Word-segment the Vietnamese text before PhoBERT tokenization
+            segment_text = word_tokenize(text, format="text")
+            tokenized_text = self.tokenizer(segment_text,
+                                            padding="max_length",
+                                            truncation=True,
+                                            max_length=256,
+                                            return_tensors="pt")
+            tokenized_text = {k: v.to(self.DEVICE) for k, v in tokenized_text.items()}
+            return tokenized_text
+
+        def get_top_acc(predictions, thre):
+            # Keep every class whose probability exceeds the threshold,
+            # sorted from most to least likely
+            results = {}
+            indexes = np.where(predictions[0] > thre)[0]
+            for index in indexes:
+                results[self.id2label[index]] = float(predictions[0][index])
+            results = {k: v for k, v in sorted(results.items(), key=lambda item: item[1], reverse=True)}
+
+            return results
+
+        tokenized_text = input_tokenizer(paragraph)
+        input_ids = tokenized_text["input_ids"]
+        attention_mask = tokenized_text["attention_mask"]
+        with torch.no_grad():
+            logits = self.model(input_ids, attention_mask)
+
+        results = get_top_acc(logits.cpu().numpy(), self.config.model.threshold)
+        results_arr = []
+        for rs in results:
+            results_arr.append({
+                "category": rs,
+                "score": results[rs]
+            })
+        return results_arr
+
+
+# if __name__ == '__main__':
+#     src_config = OmegaConf.load('config/config.yaml')
+#     category_model = CategoryModel(config=src_config)
+
+#     result = category_model.predict('''''')
+#     print(result)
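Because the head ends in a softmax and `get_top_acc` keeps every class above the configured threshold, a prediction can return zero, one, or several labels. A toy rerun of the same selection logic, with fabricated probabilities and a trimmed label map:

    import numpy as np

    # Trimmed stand-in for the id2label map built from config/classes.json
    id2label = {0: "music", 1: "food", 2: "technology"}

    def get_top_acc(predictions, thre):
        # Same selection as in CategoryModel.predict: keep classes whose
        # probability exceeds the threshold, sorted highest first
        results = {}
        indexes = np.where(predictions[0] > thre)[0]
        for index in indexes:
            results[id2label[index]] = float(predictions[0][index])
        return dict(sorted(results.items(), key=lambda item: item[1], reverse=True))

    # Fabricated softmax output over the three stand-in classes
    probs = np.array([[0.05, 0.60, 0.35]])
    print(get_top_acc(probs, 0.3))  # {'food': 0.6, 'technology': 0.35}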
test.ipynb ADDED
@@ -0,0 +1,171 @@
Jupyter notebook (Python 3.9, kernel "vietnamese_ocr") with four code cells plus one empty cell:

Cell 1 (imports):
    from omegaconf import OmegaConf
    from src.category_model import CategoryModel
    from src.category_model import PhoBERT_classification

Cell 2 (load the config, build the model, and classify a paragraph):
    src_config = OmegaConf.load('config/config.yaml')
    CategoryModel = CategoryModel(config=src_config)

    result = CategoryModel.predict('''''')
    print(result)

    Output: {'fun': 0.9741222262382507}
    (stdout also shows the VnCoreNLP word-segmentation model loading, and a
    warning that special tokens were added to the vocabulary)

Cell 3 (install the Vietnamese NLP toolkit):
    pip install underthesea

    Output: pip install log ending in "Successfully installed joblib-1.3.2
    nltk-3.8.1 python-crfsuite-0.9.9 scikit-learn-1.3.0 scipy-1.11.2
    threadpoolctl-3.2.0 underthesea-6.7.0 underthesea-core-1.0.4"

Cell 4 (sanity-check word segmentation):
    from underthesea import word_tokenize
    sentence = "Chàng trai 9X Quảng Trị khởi nghiệp từ nấm sò"

    word_tokenize(sentence, format="text")

    Output: 'Chàng trai 9X Quảng_Trị khởi_nghiệp từ nấm sò'