Update app.py
Browse files
app.py
CHANGED
@@ -2,6 +2,11 @@ import gradio as gr
|
|
2 |
import pandas as pd
|
3 |
import tempfile
|
4 |
import re
|
|
|
|
|
|
|
|
|
|
|
5 |
|
6 |
def preprocess_text(text: str) -> str:
|
7 |
"""
|
@@ -22,6 +27,9 @@ def expand_columns_if_needed(df, needed_index: int):
|
|
22 |
# ๋งจ ๋์ ๋น ์ด ์ถ๊ฐ
|
23 |
df[df.shape[1]] = None
|
24 |
|
|
|
|
|
|
|
25 |
|
26 |
def count_keywords(main_text, excel_file, direct_input):
|
27 |
"""
|
@@ -129,40 +137,119 @@ def count_keywords(main_text, excel_file, direct_input):
|
|
129 |
|
130 |
return (md_table, tmp_path)
|
131 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
132 |
|
133 |
########################
|
134 |
-
#
|
135 |
########################
|
136 |
-
with gr.Blocks() as demo:
|
137 |
-
gr.Markdown("## ๋ณธ๋ฌธ & ํค์๋ ๋ถ์ - (A5~A10000, N5~N10000)")
|
138 |
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
166 |
|
167 |
if __name__ == "__main__":
|
168 |
demo.launch()
|
|
|
2 |
import pandas as pd
|
3 |
import tempfile
|
4 |
import re
|
5 |
+
from mecab import MeCab
|
6 |
+
|
7 |
+
##############################
|
8 |
+
# 1) 공통 함수들
|
9 |
+
##############################
|
10 |
|
11 |
def preprocess_text(text: str) -> str:
|
12 |
"""
|
|
|
27 |
# ๋งจ ๋์ ๋น ์ด ์ถ๊ฐ
|
28 |
df[df.shape[1]] = None
|
29 |
|
30 |
+
##############################
|
31 |
+
# 2) 키워드 카운트 함수
|
32 |
+
##############################
|
33 |
|
34 |
def count_keywords(main_text, excel_file, direct_input):
|
35 |
"""
|
|
|
137 |
|
138 |
return (md_table, tmp_path)
|
139 |
|
140 |
+
##############################
|
141 |
+
# 3) 형태소 분석 기반 키워드 카운트 함수
|
142 |
+
##############################
|
143 |
+
|
144 |
+
def morph_analysis_and_count(text: str):
    """Extract nouns from *text* with MeCab and count their occurrences.

    Steps:
        1. Clean the input with ``preprocess_text`` (defined elsewhere in
           this file; per the original notes it keeps only Hangul).
        2. Run MeCab morphological analysis (python-mecab-ko).
        3. Keep only tokens tagged as nouns.
        4. Count each distinct noun by searching for it in the cleaned text.

    Args:
        text: Raw body text supplied by the UI. Empty or ``None`` input is
            treated as "no nouns found".

    Returns:
        A ``(markdown, csv_path)`` tuple. ``markdown`` is the frequency
        table rendered as Markdown (or a user-facing message when nothing
        was extracted or ``tabulate`` is missing). ``csv_path`` is the path
        of a temporary CSV holding the same table, or ``None`` when no
        table was produced.
    """
    no_nouns_message = "추출된 명사가 없습니다."

    # Nothing to analyse; avoids handing None to the preprocessing helper.
    if not text:
        return no_nouns_message, None

    # 1) Preprocessing
    cleaned = preprocess_text(text)

    # 2) MeCab analysis
    tagger = MeCab()
    parsed = tagger.pos(cleaned)  # e.g. [("초음파가습기", "NNG"), ("효과", "NNG"), ...]

    # 3) Keep noun tokens only (mecab-ko tags: common noun, proper noun,
    #    pronoun, bound noun).
    noun_tags = {"NNG", "NNP", "NP", "NNB"}
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # output order is reproducible (a plain set() is not ordered).
    unique_nouns = list(
        dict.fromkeys(word for word, pos in parsed if pos in noun_tags)
    )

    # 4) Count each noun in the cleaned text. str.count counts substring
    #    matches, so a short noun is also counted inside longer words.
    freq_dict = {noun: cleaned.count(noun) for noun in unique_nouns}

    # Keep only keywords that occur at least once.
    filtered_freq = {k: v for k, v in freq_dict.items() if v > 0}

    if not filtered_freq:
        return no_nouns_message, None

    # Build the table, most frequent first. A stable sort keeps ties in
    # first-seen order.
    freq_df = pd.DataFrame(list(filtered_freq.items()), columns=["명사", "빈도"])
    freq_df = freq_df.sort_values(
        by="빈도", ascending=False, kind="mergesort"
    ).reset_index(drop=True)

    # Render the table as Markdown; pandas needs 'tabulate' for this.
    try:
        md_table = freq_df.to_markdown(index=False)
    except ImportError:
        md_table = "Markdown 변환을 위해 'tabulate' 라이브러리가 필요합니다."
        return md_table, None

    # Reserve a temp file name, then close the handle before pandas writes
    # to it: writing by name to a still-open NamedTemporaryFile fails on
    # Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
        tmp_path = tmp.name
    freq_df.to_csv(tmp_path, index=False, encoding="utf-8-sig")

    return md_table, tmp_path
|
194 |
|
195 |
########################
|
196 |
+
# 4) Gradio 인터페이스 #
|
197 |
########################
|
|
|
|
|
198 |
|
199 |
+
# Two-tab Gradio UI: tab 1 counts user-supplied keywords, tab 2 counts nouns
# found by morphological analysis.
with gr.Blocks() as demo:
    with gr.Tab("키워드 카운트"):
        with gr.Row():
            # Left column: inputs
            with gr.Column():
                main_textbox = gr.Textbox(
                    label="본문 텍스트",
                    lines=16,
                    placeholder="여기에 긴 본문을 붙여넣으세요."
                )
                keyword_input = gr.Textbox(
                    label="(선택) 직접 입력 키워드 - 엔터로 구분",
                    lines=6,
                    placeholder="예)\n초음파가습기\n가습기\n..."
                )
                excel_input = gr.File(
                    label="(선택) 엑셀 업로드",
                    file_types=[".xlsx"]
                )
                run_button = gr.Button("분석하기")

            # Right column: outputs
            with gr.Column():
                output_md = gr.Markdown(label="결과 표")
                output_file = gr.File(label="결과 다운로드")

        # Input order matches count_keywords(main_text, excel_file, direct_input).
        run_button.click(
            fn=count_keywords,
            inputs=[main_textbox, excel_input, keyword_input],
            outputs=[output_md, output_file]
        )

    with gr.Tab("형태소 분석 기반 카운트"):
        with gr.Row():
            # Left column: inputs
            with gr.Column():
                morph_text_input = gr.Textbox(
                    label="본문 텍스트",
                    lines=16,
                    placeholder="여기에 긴 본문을 붙여넣으세요."
                )
                morph_run_button = gr.Button("분석하기")

            # Right column: outputs
            with gr.Column():
                morph_result_display = gr.Markdown(label="분석 결과")
                morph_download_button = gr.File(label="결과 다운로드")

        morph_run_button.click(
            fn=morph_analysis_and_count,
            inputs=morph_text_input,
            outputs=[morph_result_display, morph_download_button]
        )


if __name__ == "__main__":
    demo.launch()
|