{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Vietnamese category classification: inference smoke test\n",
    "\n",
    "Loads `CategoryModel` with `config/config.yaml`, runs a single prediction, and then tries `underthesea` word segmentation on a sample headline."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from omegaconf import OmegaConf\n",
    "\n",
    "# NOTE(review): PhoBERT_classification is not referenced directly in this notebook.\n",
    "# It is presumably imported so that a pickled model can resolve its class when\n",
    "# loaded; confirm before removing.\n",
    "from src.category_model import CategoryModel, PhoBERT_classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "VnCoreNLP model folder . already exists! Please load VnCoreNLP from this folder!\n",
      "2023-09-07 13:26:04 INFO  WordSegmenter:24 - Loading Word Segmentation model\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'fun': 0.9741222262382507}\n"
     ]
    }
   ],
   "source": [
    "src_config = OmegaConf.load('config/config.yaml')\n",
    "\n",
    "# Bind the instance to its own name. Rebinding `CategoryModel` to the instance\n",
    "# would hide the class, so this cell could not be safely re-run.\n",
    "category_model = CategoryModel(config=src_config)\n",
    "\n",
    "# Empty placeholder input kept from the original notebook; put the text to classify here.\n",
    "sample_text = ''''''\n",
    "result = category_model.predict(sample_text)\n",
    "print(result)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Word segmentation with underthesea\n",
    "\n",
    "The install below is pinned to the version recorded when this notebook was first run (6.7.0). Its output was cleared because the cell has not been re-run in this form; restart the kernel after installing if the import in the next cell fails."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# %pip (rather than a bare `pip`) installs into the environment of the running kernel.\n",
    "%pip install -q underthesea==6.7.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Chàng trai 9X Quảng_Trị khởi_nghiệp từ nấm sò'"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from underthesea import word_tokenize\n",
    "\n",
    "sentence = \"Chàng trai 9X Quảng Trị khởi nghiệp từ nấm sò\"\n",
    "\n",
    "# format=\"text\" returns a single string in which the syllables of a multi-syllable\n",
    "# word are joined with underscores (see the output below).\n",
    "word_tokenize(sentence, format=\"text\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "vietnamese_ocr",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.0"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}