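"""Streamlit app for AI-assisted Japanese reading (spaCy + GiNZA): keyword
extraction, reading-annotated text, Jisho definitions and example sentences,
inflection tables, named entities, and token features."""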
from jisho_api.word import Word
from jisho_api.sentence import Sentence
import pandas as pd
import re
import spacy
from spacy_streamlit import visualize_ner, visualize_tokens
#from spacy.language import Language
from spacy.tokens import Doc
import spacy_ke
import streamlit as st

# Global variables
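# DEFAULT_TEXT is an excerpt from the Japanese edition of "The Little Prince"
# (『星の王子さま』), used as the demo input.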
DEFAULT_TEXT = """それまで、ぼくはずっとひとりぼっちだった。だれともうちとけられないまま、6年まえ、ちょっとおかしくなって、サハラさばくに下りた。ぼくのエンジンのなかで、なにかがこわれていた。ぼくには、みてくれるひとも、おきゃくさんもいなかったから、なおすのはむずかしいけど、ぜんぶひとりでなんとかやってみることにした。それでぼくのいのちがきまってしまう。のみ水は、たった7日ぶんしかなかった。
 1日めの夜、ぼくはすなの上でねむった。ひとのすむところは、はるかかなただった。海のどまんなか、いかだでさまよっているひとよりも、もっとひとりぼっち。だから、ぼくがびっくりしたのも、みんなわかってくれるとおもう。じつは、あさ日がのぼるころ、ぼくは、ふしぎなかわいいこえでおこされたんだ。
「ごめんください……ヒツジの絵をかいて!」
「えっ?」
「ぼくにヒツジの絵をかいて……」
『星の王子さま』"""
DESCRIPTION = "AI模型輔助語言學習:日語"  # "AI-assisted language learning: Japanese"
TOK_SEP = " | "  # separator between annotated tokens in the analyzed-text view
MODEL_NAME = "ja_ginza"  # GiNZA Japanese model for spaCy

# External API callers
def parse_jisho_senses(word):
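    """Query Jisho.org (via jisho_api) for `word` and display its senses.

    Uses the first entry flagged as common and shows at most three senses,
    each with its English definitions and parts of speech.
    """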
    res = Word.request(word)
    if res is None:  # jisho_api may return no result object when the lookup fails
        st.error("Can't get response from Jisho!")
        return
    response = res.dict()
    if response["meta"]["status"] == 200:
        data = response["data"]
        commons = [d for d in data if d["is_common"]]
        if commons:
            common = commons[0] # Only get the first entry that is common
            senses = common["senses"]
            if len(senses) > 3:
                senses = senses[:3]
            with st.container():
                for idx, sense in enumerate(senses):
                    eng_def = "; ".join(sense["english_definitions"])
                    pos = "/".join(sense["parts_of_speech"])
                    st.write(f"Sense {idx+1}: {eng_def} ({pos})")
        else:
            st.info("Found no common words on Jisho!")
    else:
        st.error("Can't get response from Jisho!")


def parse_jisho_sentences(word):
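    """Query Jisho.org for example sentences containing `word` and display
    at most three, each with its Japanese text and English translation."""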
    res = Sentence.request(word)
    try:
        response = res.dict()
        data = response["data"]
        if len(data) > 3:
            sents = data[:3]
        else:
            sents = data
        with st.container():
            for idx, sent in enumerate(sents):
                eng = sent["en_translation"]
                jap = sent["japanese"]
                st.write(f"Sentence {idx+1}: {jap}")
                st.write(f"({eng})")
    except Exception:  # the request may fail or return no usable data
        st.info("Found no results on Jisho!")

# Utility functions
def create_jap_df(tokens):
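    """Show a table of unique tokens (surface form, reading, inflection,
    lemma) and offer it as a downloadable CSV."""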
    seen_texts = []
    filtered_tokens = []
    for tok in tokens:
        if tok.text not in seen_texts:
            filtered_tokens.append(tok)
            seen_texts.append(tok.text)  # remember the surface form so duplicates are skipped

    df = pd.DataFrame(
        {
            "單詞": [tok.text for tok in filtered_tokens],
            "發音": ["/".join(tok.morph.get("Reading")) for tok in filtered_tokens],
            "詞形變化": ["/".join(tok.morph.get("Inflection")) for tok in filtered_tokens],
            "原形": [tok.lemma_ for tok in filtered_tokens],
            # "正規形": [tok.norm_ for tok in filtered_tokens],
        }
    )
    st.dataframe(df)
    csv = df.to_csv().encode('utf-8')
    st.download_button(
        label="下載表格",
        data=csv,
        file_name='jap_forms.csv',
    )

def filter_tokens(doc):
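    """Return tokens that are not punctuation, symbols, emails, URLs,
    numbers, or whitespace."""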
    clean_tokens = [
        tok for tok in doc
        if tok.pos_ not in ("PUNCT", "SYM")
        and not (tok.like_email or tok.like_url or tok.like_num)
        and not (tok.is_punct or tok.is_space)
    ]
    return clean_tokens

def create_kw_section(doc):
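    """Render the keyword section: a slider for the number of keywords and
    the YAKE keywords extracted by spacy_ke, sorted by score (descending)."""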
    st.markdown("## 關鍵詞分析") 
    kw_num = st.slider("請選擇關鍵詞數量", 1, 10, 3)
    kws2scores = {keyword: score for keyword, score in doc._.extract_keywords(n=kw_num)}
    kws2scores = sorted(kws2scores.items(), key=lambda x: x[1], reverse=True)
    count = 1
    for keyword, score in kws2scores: 
        rounded_score = round(score, 3)
        st.write(f"{count} >>> {keyword} ({rounded_score})")
        count += 1 
            
# Page setting
st.set_page_config(
    page_icon="🤠",
    layout="wide",
    initial_sidebar_state="auto",
)
st.markdown(f"# {DESCRIPTION}") 

# Load the model
nlp = spacy.load(MODEL_NAME)

# Add pipelines to spaCy
nlp.add_pipe("yake") # keyword extraction
# nlp.add_pipe("merge_entities") # Merge entity spans to tokens

# Page starts from here
st.markdown("## 待分析文本")     
st.info("請在下面的文字框輸入文本並按下Ctrl + Enter以更新分析結果")
text = st.text_area("",  DEFAULT_TEXT, height=200)
doc = nlp(text)
st.markdown("---")

st.info("請勾選以下至少一項功能")
keywords_extraction = st.checkbox("關鍵詞分析", False)  # keyword analysis
analyzed_text = st.checkbox("增強文本", True)  # reading-annotated text
defs_examples = st.checkbox("單詞解析", True)  # word definitions and example sentences
morphology = st.checkbox("詞形變化", False)  # inflected forms
ner_viz = st.checkbox("命名實體", True)  # named entities
tok_table = st.checkbox("斷詞特徵", False)  # token features

if keywords_extraction:
    create_kw_section(doc)

if analyzed_text:
    st.markdown("## 分析後文本") 
    for idx, sent in enumerate(doc.sents):
        clean_tokens = [tok for tok in sent if tok.pos_ not in ["PUNCT", "SYM"]]
        tokens_text = [tok.text for tok in clean_tokens]
        readings = ["/".join(tok.morph.get("Reading")) for tok in clean_tokens]
        display = [f"{tok_text} [{reading}]" for tok_text, reading in zip(tokens_text, readings)]
        if display:
            display_text = TOK_SEP.join(display)
            st.write(f"{idx+1} >>> {display_text}")
        else:
            st.write(f"{idx+1} >>> EMPTY LINE")

if defs_examples:
    st.markdown("## 單詞解釋與例句")
    clean_tokens = filter_tokens(doc)
    alphanum_pattern = re.compile(r"[a-zA-Z0-9]")
    clean_lemmas = [tok.lemma_ for tok in clean_tokens if not alphanum_pattern.search(tok.lemma_)]
    vocab = list(set(clean_lemmas))
    if vocab:
        selected_words = st.multiselect("請選擇要查詢的單詞: ", vocab, vocab[0:3])
        for w in selected_words:
            st.write(f"### {w}")
            with st.expander("點擊 + 檢視結果"):
                parse_jisho_senses(w)
                parse_jisho_sentences(w)

if morphology:
    st.markdown("## 詞形變化")
    # Collect inflected forms
    inflected_forms = [tok for tok in doc if tok.tag_.startswith("動詞") or tok.tag_.startswith("形")]
    if inflected_forms:
        create_jap_df(inflected_forms)

if ner_viz:
    ner_labels = nlp.get_pipe("ner").labels
    visualize_ner(doc, labels=ner_labels, show_table=False, title="命名實體")

if tok_table:
    visualize_tokens(doc, attrs=["text", "pos_", "tag_", "dep_", "head"], title="斷詞特徵")