"""Streamlit demo: word analogies (A - B + C) with a pretrained word2vec model.

The script also loads a corpus CSV, a gensim dictionary, an LDA topic model and
its similarity index, and builds a table of the top words per topic.
"""
import random
import re

import streamlit as st
import pandas as pd
import numpy as np
from fugashi import Tagger

import matplotlib.pyplot as plt
import japanize_matplotlib  # imported for its side effect on matplotlib fonts
import seaborn as sns

from gensim.corpora.dictionary import Dictionary
from gensim import models
from gensim.models.word2vec import Word2Vec
from gensim import similarities

sns.set(font='IPAexGothic')

# --- Load models and data ---------------------------------------------------
# NOTE(review): Streamlit re-executes the script on each interaction, so these
# loads presumably run on every rerun; consider caching them if that is slow.

# Regex that matches everything EXCEPT kana, CJK ideographs, whitespace and
# ASCII letters (i.e. the symbols to strip). Code points above U+FFFF need the
# eight-digit \UXXXXXXXX escape: the former "\u20000-\u2ffff" was parsed as
# "\u2000", the range "0-\u2fff" and a literal "f", which kept most symbols.
pattern = r"[^\u3040-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\U00020000-\U0002ffff\sa-zA-Z]"
df = pd.read_csv("./raw_corpus.csv")  # corpus
dictionary = Dictionary.load("./livedoor.dict")  # dictionary
lda = models.ldamodel.LdaModel.load("./lda.model")  # topic model
# Similarity index of the corpus under the topic model.
index = similarities.MatrixSimilarity.load("./lda.index")
word_dist = lda.get_topics()  # per-topic word distribution, shape (K, V)
# Shiroyagi pretrained Japanese word2vec model:
# https://github.com/shiroyagicorp/japanese-word2vec-model-builder
w2v = Word2Vec.load('./word2vec.gensim.model')

# --- Topic model summary: one row per (topic, word) with weight in percent ---
num_words = 30
topic_rows = []
for topic_id, word_weights in lda.show_topics(
    num_topics=5, num_words=num_words, formatted=False
):
    for word, weight in word_weights:
        topic_rows.append((topic_id, word, round(float(weight) * 100, 2)))
topic_df = pd.DataFrame(topic_rows, columns=["topic", "word", "weight"])

# --- UI -----------------------------------------------------------------------
st.sidebar.markdown("Set Parameter")

st.header("word2vecによるアナロジー")
st.subheader("単語の演算")
st.caption("演算対象の単語")

col1, col2, col3 = st.columns(3)
with col1:
    atom = st.text_input("元になる単語")
with col2:
    negative = st.text_input("ー引く単語")
with col3:
    positive = st.text_input("+足す単語")

button = st.button("演算する")
if button:
    if not (atom and negative and positive):
        # A blank field would otherwise surface as a KeyError traceback.
        st.warning("3つの単語をすべて入力してください。")
    else:
        st.text(f"{atom} - {negative} + {positive}")
        try:
            # `negative` must be a list: gensim releases that do not wrap a
            # bare string iterate it character by character.
            similar_words = w2v.wv.most_similar(
                positive=[atom, positive], negative=[negative]
            )
        except KeyError as err:
            # Raised when a word is not in the model's vocabulary.
            st.error(f"モデルの語彙にない単語が含まれています: {err}")
        else:
            st.dataframe(similar_words)