File size: 2,565 Bytes
453a744
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import streamlit as st
import pandas as pd
import numpy as np
from fugashi import Tagger
import re
import random
import matplotlib.pyplot as plt
import japanize_matplotlib
import seaborn as sns
from gensim.corpora.dictionary import Dictionary
from gensim import  models
from gensim.models.word2vec import Word2Vec
from gensim import similarities

# Render seaborn plots with the IPAexGothic font (presumably so Japanese
# labels display correctly — confirm the font is installed on the host).
sns.set(font="IPAexGothic")

# モデルなどの読み込み
# Regex matching every character that should be stripped: anything that is
# not kana, a CJK ideograph, whitespace, or an ASCII letter.
# `\u` consumes exactly four hex digits, so the supplementary-plane range
# U+20000-U+2FFFF must use the eight-digit `\U` escape. Written as
# `\u20000-\u2ffff` it parsed as U+2000, the range "0"-U+2FFF, and "f", which
# let digits and most punctuation through while dropping plane-2 kanji.
# NOTE(review): digits were only kept because of that mis-parse; add `0-9` to
# the class if they are meant to survive.
pattern = r"[^\u3040-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\U00020000-\U0002ffff\sa-zA-Z]"
# NOTE(review): Streamlit re-executes the script on user interaction, so these
# loads presumably repeat on every rerun — consider caching them.

# Corpus.
df = pd.read_csv("./raw_corpus.csv")
# Dictionary.
dictionary = Dictionary.load("./livedoor.dict")
# Topic model (LDA).
lda = models.LdaModel.load("./lda.model")
# Index of the corpus built with the topic model.
index = similarities.MatrixSimilarity.load("./lda.index")
# Word distribution of the topic model, shape (K, V).
word_dist = lda.get_topics()
# Shiroyagi word2vec model:
# https://github.com/shiroyagicorp/japanese-word2vec-model-builder
w2v = Word2Vec.load("./word2vec.gensim.model")

# Flatten the topic model's per-topic word distributions into a DataFrame
# with one row per (topic, word) pair.
num_words = 30
topic_rows = [
    # Weight is scaled by 100 and rounded to two decimals.
    (topic_id, term, round(float(prob) * 100, 2))
    for topic_id, terms in lda.show_topics(
        num_topics=5, num_words=num_words, formatted=False
    )
    for term, prob in terms
]
topic_list = [row[0] for row in topic_rows]
word_list = [row[1] for row in topic_rows]
weight_list = [row[2] for row in topic_rows]

topic_df = pd.DataFrame(
    {"topic": topic_list, "word": word_list, "weight": weight_list}
)


st.sidebar.markdown("Set Parameter")

# An earlier draft chose the three operands from preset words via sidebar
# radio buttons; the operands are free-text inputs now.

st.header("word2vecによるアナロジー")
st.subheader("単語の演算")
st.caption("演算対象の単語")

# One text box per operand, laid out side by side in three columns.
col1, col2, col3 = st.columns(3)
atom = col1.text_input("元になる単語")  # base word
negative = col2.text_input("ー引く単語")  # word to subtract
positive = col3.text_input("+足す単語")  # word to add

button = st.button("演算する")

# Compute "atom - negative + positive" with word2vec once the button is
# pressed and show the nearest words with their similarity scores.
if button:
    st.text(f"{atom} - {negative} + {positive}")

    # gensim documents `positive`/`negative` as lists of keys. A bare string
    # may be iterated character by character depending on the gensim version,
    # so every operand is wrapped in a list. Blank inputs are skipped because
    # an empty string is never a vocabulary key.
    positive_words = [w for w in (atom.strip(), positive.strip()) if w]
    negative_words = [w for w in (negative.strip(),) if w]

    if not positive_words and not negative_words:
        # most_similar cannot run without any input word.
        st.warning("単語を入力してください。")
    else:
        try:
            x = w2v.wv.most_similar(
                positive=positive_words, negative=negative_words
            )
        except KeyError as err:
            # Raised by gensim when a word is missing from the vocabulary;
            # report it in the UI instead of crashing the script run.
            st.error(f"モデルの語彙にない単語が含まれています: {err}")
        else:
            st.dataframe(x)