{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Vocabulary-constrained keyword extraction for Russian queries\n",
    "\n",
    "This notebook looks for known service names (for example `почта россии`) in a free-form Russian query.\n",
    "\n",
    "- The query and every vocabulary phrase are normalised with the same tokenizer (`pymorphy3` normal forms).\n",
    "- A `CountVectorizer` restricted to the normalised vocabulary proposes candidate phrases.\n",
    "- `KeyBERT`, backed by a spaCy pipeline, ranks the candidates found in the query.\n",
    "\n",
    "The notebook is meant to be run top to bottom on a fresh kernel (Restart Kernel, then Run All)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import string\n",
    "\n",
    "import pymorphy3\n",
    "import spacy\n",
    "from keybert import KeyBERT\n",
    "from keyphrase_vectorizers import KeyphraseCountVectorizer  # only referenced by the commented-out alternative vectorizer\n",
    "from sklearn.feature_extraction.text import CountVectorizer"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup\n",
    "\n",
    "All shared objects are created here so that later cells do not depend on execution order.\n",
    "\n",
    "NOTE(review): `ru_core_news_sm` is a small pipeline; confirm that its embeddings are good enough for ranking."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# spaCy pipeline handed to KeyBERT as its embedding model.\n",
    "nlp = spacy.load(\"ru_core_news_sm\", exclude=['tokenizer', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])\n",
    "kw_model = KeyBERT(model=nlp)\n",
    "\n",
    "# Morphological analyzer used by tokenize_sentence to obtain normal forms.\n",
    "morph = pymorphy3.MorphAnalyzer()\n",
    "\n",
    "# Translation table that maps every character of string.punctuation to a space.\n",
    "# Built once here instead of on every tokenize_sentence call.\n",
    "PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Tokenisation\n",
    "\n",
    "The same tokenizer is applied to the vocabulary and, through the vectorizer, to the query, so that inflected forms such as `почту` match the vocabulary entry `почта`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def tokenize_sentence(text):\n",
    "    \"\"\"Split `text` into words and return their pymorphy3 normal forms.\n",
    "\n",
    "    Every character of `string.punctuation` (ASCII punctuation only) is\n",
    "    replaced with a space, the text is split on whitespace, and each word\n",
    "    is replaced by the `normal_form` of its first `morph.parse` result.\n",
    "\n",
    "    Returns a list of strings; an empty or punctuation-only text gives [].\n",
    "    \"\"\"\n",
    "    words = text.translate(PUNCT_TO_SPACE).split()\n",
    "    return [morph.parse(word)[0].normal_form for word in words]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Sanity checks\n",
    "\n",
    "The characters that are treated as punctuation, one single-word lookup, and one full phrase. The expected token list is the output recorded by an earlier run of this notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "string.punctuation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "morph.parse('почту')[0].normal_form"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "assert tokenize_sentence('Почта России? трекинг') == ['почта', 'россия', 'трекинг']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Vocabulary\n",
    "\n",
    "`vocab_raw` lists the phrases that may be returned as keywords. `aliases` pairs a canonical name with its alternative spellings.\n",
    "\n",
    "**TODO:** `aliases` is not used by the extraction step yet."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "vocab_raw = [\n",
    "    'почта россии', 'почта', 'почта россии трекинг',\n",
    "    'яндекс доставка', 'яндекс доставка экспресс', 'яндекс доставка express',\n",
    "    'альфабанк', 'альфа банк',\n",
    "]\n",
    "aliases = [\n",
    "    #('canonical name', ['aliases', ...])\n",
    "    ('почта россии', ['почта']),\n",
    "    ('яндекс доставка экспресс', ['яндекс доставка express']),\n",
    "    ('альфабанк', ['альфа банк']),\n",
    "]\n",
    "# Normalise each phrase with the tokenizer the vectorizer will use.\n",
    "# dict.fromkeys keeps the first occurrence of each normalised phrase, in order:\n",
    "# CountVectorizer raises ValueError when a vocabulary list repeats a term.\n",
    "vocab = list(dict.fromkeys(\" \".join(tokenize_sentence(s)) for s in vocab_raw))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Keyword extraction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "doc = 'как подключить модуль почту россии трекинг'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Alternative vectorizer that was tried earlier:\n",
    "#vectorizer = KeyphraseCountVectorizer(spacy_pipeline='ru_core_news_sm', vocabulary=vocab)\n",
    "vectorizer = CountVectorizer(\n",
    "    ngram_range=(1, 4),\n",
    "    vocabulary=vocab,\n",
    "    tokenizer=tokenize_sentence,\n",
    "    # token_pattern is ignored when a tokenizer is given; passing None avoids\n",
    "    # the scikit-learn UserWarning about the unused parameter.\n",
    "    token_pattern=None,\n",
    ")\n",
    "kw_model.extract_keywords(doc, vectorizer=vectorizer)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Previously recorded result\n",
    "\n",
    "Before this notebook was cleaned up, a run of the extraction cell returned:\n",
    "\n",
    "`[('почта россия трекинг', 0.4786), ('почта россия', 0.3053), ('почта', 0.2357)]`\n",
    "\n",
    "Outputs were cleared during the clean-up; re-run the notebook to regenerate them."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}