# Spaces: Sleeping (page-status text captured with the source; not program code)
import MeCab | |
import re | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
class JapaneseTextVectorizer:
    """Extract nouns from Japanese text with MeCab and rank them by TF-IDF."""

    # Matches a token that consists solely of hiragana (U+3040-U+309F).
    # Compiled once here instead of on every token inside the parsing loop.
    _HIRAGANA_ONLY = re.compile(r'[\u3040-\u309F]+')

    def __init__(self):
        """Initialise the MeCab tagger and the TF-IDF vectorizer."""
        self.mecab_tagger = MeCab.Tagger()
        # The custom token_pattern also accepts single-character tokens, and
        # norm=None leaves the TF-IDF values un-normalised.
        self.tfidf_model = TfidfVectorizer(token_pattern='(?u)\\b\\w+\\b', norm=None)
        # Not read by any method of this class; kept so existing callers that
        # access the attribute keep working.
        self.vocab_list = []

    def _extract_nouns(self, text: str) -> list:
        """Return the nouns found in *text*, in order of appearance.

        Parameters:
        - text (str): text to extract nouns from

        Returns:
        - nouns (list): surface forms of the tokens whose first MeCab feature
          field is "名詞" (noun), excluding tokens that are purely numeric
          or written only in hiragana
        """
        nouns = []
        node = self.mecab_tagger.parseToNode(text)
        while node:
            word = node.surface
            # The first comma-separated feature field is the part of speech.
            part_of_speech = node.feature.split(",")[0]
            if (
                part_of_speech == "名詞"
                and not word.isnumeric()
                # fullmatch() requires the whole token to be hiragana; unlike
                # '^...$' with match(), it does not tolerate a trailing newline.
                and not self._HIRAGANA_ONLY.fullmatch(word)
            ):
                nouns.append(word)
            node = node.next
        return nouns

    def fit_transform(self, text: str, top_n: int = 5) -> dict:
        """Return the highest-scoring nouns of *text* with their TF-IDF values.

        The vectorizer is re-fitted on every call, with each extracted noun
        treated as its own document; the space-joined nouns are then
        transformed as a single document.

        Parameters:
        - text (str): text to convert to a TF-IDF representation
        - top_n (int): maximum number of entries to return (non-negative,
          default 5)

        Returns:
        - tfidf_dict (dict): term -> TF-IDF value for at most ``top_n`` terms
          with a positive score, ordered from highest to lowest score. An
          empty dict is returned when the text yields no usable nouns.
        """
        nouns = self._extract_nouns(text)
        if not nouns:
            # Nothing to score; fitting on an empty corpus would raise
            # "empty vocabulary" instead of giving a usable result.
            return {}

        try:
            self.tfidf_model.fit(nouns)
        except ValueError:
            # Raised when none of the nouns contains a token matching the
            # vectorizer's token pattern (e.g. symbol-only nouns), which
            # leaves the vocabulary empty.
            return {}

        joined_nouns = " ".join(nouns)
        scores = self.tfidf_model.transform([joined_nouns]).toarray()[0]
        terms = self.tfidf_model.get_feature_names_out()
        positive_scores = {
            term: score for term, score in zip(terms, scores) if score > 0
        }

        # Rank by TF-IDF value, highest first, and keep the leading entries.
        ranked = sorted(positive_scores.items(), key=lambda item: item[1], reverse=True)
        return dict(ranked[:top_n])