{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [],
   "source": [
    "doc = 'как подключить модуль почту россии трекинг'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "from keybert import KeyBERT\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "import spacy\n",
    "nlp = spacy.load(\"ru_core_news_sm\", exclude=['tokenizer', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])\n",
    "kw_model = KeyBERT(model=nlp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'!\"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~'"
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "string.punctuation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "import string\n",
    "\n",
    "def tokenize_sentence(text):\n",
    "    # remove punctuation\n",
    "    text = text.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))\n",
    "    # tokenize\n",
    "    return [morph.parse(word)[0].normal_form for word in text.split()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "почта россии\n",
      "почта\n",
      "почта россии трекинг\n"
     ]
    }
   ],
   "source": [
    "vocab_raw = [\n",
    "  'почта россии', 'почта', 'почта россии трекинг',\n",
    "  'яндекс доставка', 'яндекс доставка экспресс', 'яндекс доставка express',\n",
    "  'альфабанк', 'альфа банк',\n",
    "]\n",
    "aliases = [\n",
    "  #('canonical name', ['aliases', ...])\n",
    "  ('почта россии', ['почта']),\n",
    "  ('яндекс доставка экспресс', ['яндекс доставка express']),\n",
    "  ('альфабанк', ['альфа банк']),\n",
    "]\n",
    "vocab = [\" \".join(tokenize_sentence(s)) for s in vocab_raw]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "как подключить модуль почту  россии трекинг\n",
      "как подключить модуль почту  россии трекинг\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/makcrx/anaconda3/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:528: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[('почта россия трекинг', 0.4786), ('почта россия', 0.3053), ('почта', 0.2357)]"
      ]
     },
     "execution_count": 87,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from keyphrase_vectorizers import KeyphraseCountVectorizer\n",
    "#vectorizer = KeyphraseCountVectorizer(spacy_pipeline='ru_core_news_sm', vocabulary=vocab)\n",
    "vectorizer = CountVectorizer(ngram_range=(1, 4), vocabulary=vocab, tokenizer=tokenize_sentence)\n",
    "kw_model.extract_keywords(doc, vectorizer=vectorizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pymorphy3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "morph = pymorphy3.MorphAnalyzer()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'почту россия'"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "morph.parse('почту')[0].normal_form"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['почта', 'россия', 'трекинг']"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenize_sentence('Почта России? трекинг')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}