Update app.py
Browse files
app.py
CHANGED
@@ -1,255 +1,2 @@
|
|
1 |
-
import
|
2 |
-
|
3 |
-
import tempfile
|
4 |
-
import re
|
5 |
-
from mecab import MeCab
|
6 |
-
|
7 |
-
##############################
|
8 |
-
# 1) κ³΅ν΅ ν¨μλ€
|
9 |
-
##############################
|
10 |
-
|
11 |
-
def preprocess_text(text: str) -> str:
|
12 |
-
"""
|
13 |
-
μΌν, λ§μΉ¨ν, 곡백, μ«μ, μμ΄ λ±
|
14 |
-
νκΈ(κ°-ν£) μ΄μΈμ λ¬Έμλ₯Ό λͺ¨λ μ κ±°νκ³
|
15 |
-
νκΈλ§ μ°μμΌλ‘ λ¨κΈ΄λ€.
|
16 |
-
"""
|
17 |
-
return re.sub(r'[^κ°-ν£]', '', text)
|
18 |
-
|
19 |
-
def expand_columns_if_needed(df, needed_index: int):
|
20 |
-
"""
|
21 |
-
dfμ (needed_index + 1)λ²μ§Έ μ΄μ΄ μ‘΄μ¬νμ§ μμΌλ©΄
|
22 |
-
μμλ‘ νμ₯ν΄μ λΉ μ΄μ λ§λ λ€.
|
23 |
-
μ) needed_index=13 β Nμ΄(14λ²μ§Έ μ΄)μ μ°λ €λ©΄
|
24 |
-
df.shape[1]μ΄ 14 μ΄μμ΄ λλλ‘ νμ₯
|
25 |
-
"""
|
26 |
-
while df.shape[1] <= needed_index:
|
27 |
-
# 맨 λμ λΉ μ΄ μΆκ°
|
28 |
-
df[df.shape[1]] = None
|
29 |
-
|
30 |
-
##############################
|
31 |
-
# 2) ν€μλ μΉ΄μ΄νΈ ν¨μ
|
32 |
-
##############################
|
33 |
-
|
34 |
-
def count_keywords(main_text, excel_file, direct_input):
|
35 |
-
"""
|
36 |
-
- μ§μ μ
λ ₯ ν€μλ(μ€λ°κΏ ꡬλΆ)κ° μμΌλ©΄ μ°μ μ¬μ©(Aμ΄=ν€μλ, Bμ΄=μΉ΄μ΄νΈ)
|
37 |
-
- μμΌλ©΄ μμ
μ¬μ©:
|
38 |
-
* ν€λλ₯Ό μ¬μ©νμ§ μμ(header=None) β 1ν κ·Έλλ‘ λ³΄μ‘΄
|
39 |
-
* A5~A10000: ν€μλ
|
40 |
-
* N5~N10000: μΉ΄μ΄νΈ κΈ°λ‘(μ΄ μΈλ±μ€ 13)
|
41 |
-
- λ³Έλ¬Έμ νκΈλ§ λ¨κΈ°κ³ .count(ν€μλ)λ‘ λ±μ₯ νμλ₯Ό κ³μ°
|
42 |
-
- 1ν μ΄μμΈ ν€μλλ§ κ²°κ³Ό ν(Markdown)μ νμ
|
43 |
-
"""
|
44 |
-
# λ³Έλ¬Έ μ μ²λ¦¬
|
45 |
-
cleaned_text = preprocess_text(main_text)
|
46 |
-
|
47 |
-
direct_input = direct_input.strip()
|
48 |
-
if direct_input:
|
49 |
-
# ===== μ§μ μ
λ ₯ ν€μλ μ¬μ© =====
|
50 |
-
keywords = [kw.strip() for kw in direct_input.split('\n') if kw.strip()]
|
51 |
-
if not keywords:
|
52 |
-
return ("μ§μ μ
λ ₯ ν€μλκ° μμ΅λλ€.", None)
|
53 |
-
|
54 |
-
# counts
|
55 |
-
counts = [cleaned_text.count(k) for k in keywords]
|
56 |
-
|
57 |
-
# 1ν μ΄μ νν°
|
58 |
-
filtered = [(k, c) for k, c in zip(keywords, counts) if c > 0]
|
59 |
-
|
60 |
-
if not filtered:
|
61 |
-
# μ λΆ 0ν
|
62 |
-
msg = "λ³Έλ¬Έμ ν΄λΉ ν€μλκ° μ ν λ±μ₯νμ§ μμμ΅λλ€."
|
63 |
-
# κ·Έλλ κ²°κ³Ό μμ
(A,B) λ§λ€μ΄μ λ°ν
|
64 |
-
tmp_df = pd.DataFrame({"A": keywords, "B": counts})
|
65 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
|
66 |
-
tmp_df.to_excel(tmp.name, index=False, header=False)
|
67 |
-
# header=False β 1νμ "A,B" κ°μ μ΄μ΄λ¦ μ μ°λλ‘
|
68 |
-
tmp_path = tmp.name
|
69 |
-
return (msg, tmp_path)
|
70 |
-
|
71 |
-
# 1ν μ΄μ ν(Markdown)
|
72 |
-
lines = ["| ν€μλ | λ±μ₯ νμ |", "|---|---|"]
|
73 |
-
for (k, c) in filtered:
|
74 |
-
lines.append(f"| {k} | {c} |")
|
75 |
-
md_table = "\n".join(lines)
|
76 |
-
|
77 |
-
# μμ
(A,B) μ μ₯
|
78 |
-
tmp_df = pd.DataFrame({"A": keywords, "B": counts})
|
79 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
|
80 |
-
tmp_df.to_excel(tmp.name, index=False, header=False)
|
81 |
-
tmp_path = tmp.name
|
82 |
-
|
83 |
-
return (md_table, tmp_path)
|
84 |
-
|
85 |
-
else:
|
86 |
-
# ===== μμ
νμΌ μ¬μ© =====
|
87 |
-
if not excel_file:
|
88 |
-
return ("μμ
νμΌμ μ
λ‘λνκ±°λ ν€μλλ₯Ό μ§μ μ
λ ₯νμΈμ.", None)
|
89 |
-
|
90 |
-
# 1) μμ
μ 체λ₯Ό header=Noneλ‘ μ½μ β 1ν κ·Έλλ‘ λ³΄μ‘΄
|
91 |
-
df = pd.read_excel(excel_file.name, header=None)
|
92 |
-
|
93 |
-
# 2) A5~A10000 β (μΈλ±μ€ 4~9999) ν€μλ
|
94 |
-
max_row = min(df.shape[0], 10000) # μ€μ ν κ°μ vs 10000 μ€ λ μμ κ²
|
95 |
-
sub_df = df.iloc[4:max_row, 0] # 첫 λ²μ§Έ μ΄(μΈλ±μ€=0)
|
96 |
-
|
97 |
-
# strip + NaN μ κ±°
|
98 |
-
keywords = sub_df.dropna().astype(str).apply(lambda x: x.strip()).tolist()
|
99 |
-
if not keywords:
|
100 |
-
return ("A5~A10000 λ²μμ ν€μλκ° μμ΅λλ€.", None)
|
101 |
-
|
102 |
-
# counts
|
103 |
-
counts = [cleaned_text.count(k) for k in keywords]
|
104 |
-
|
105 |
-
# 1ν μ΄μ νν°
|
106 |
-
filtered = [(k, c) for k, c in zip(keywords, counts) if c > 0]
|
107 |
-
if not filtered:
|
108 |
-
msg = "λ³Έλ¬Έμ ν΄λΉ ν€μλκ° μ ν λ±μ₯νμ§ μμμ΅λλ€(0ν)."
|
109 |
-
# κ·Έλλ N5~N10000μ κΈ°λ‘
|
110 |
-
expand_columns_if_needed(df, 13) # Nμ΄=13
|
111 |
-
for i, cnt_val in enumerate(counts):
|
112 |
-
row_idx = 4 + i
|
113 |
-
if row_idx < df.shape[0]:
|
114 |
-
df.iloc[row_idx, 13] = cnt_val
|
115 |
-
|
116 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
|
117 |
-
df.to_excel(tmp.name, index=False, header=False)
|
118 |
-
tmp_path = tmp.name
|
119 |
-
return (msg, tmp_path)
|
120 |
-
|
121 |
-
# 1ν μ΄μ ν(Markdown)
|
122 |
-
lines = ["| ν€μλ | λ±μ₯ νμ |", "|---|---|"]
|
123 |
-
for (k, c) in filtered:
|
124 |
-
lines.append(f"| {k} | {c} |")
|
125 |
-
md_table = "\n".join(lines)
|
126 |
-
|
127 |
-
# N5~N10000μ κΈ°λ‘
|
128 |
-
expand_columns_if_needed(df, 13) # μ΄μ΄ 14κ° λ―Έλ§μ΄λ©΄ Nμ΄(13)κΉμ§ νμ₯
|
129 |
-
for i, cnt_val in enumerate(counts):
|
130 |
-
row_idx = 4 + i
|
131 |
-
if row_idx < df.shape[0]:
|
132 |
-
df.iloc[row_idx, 13] = cnt_val
|
133 |
-
|
134 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
|
135 |
-
df.to_excel(tmp.name, index=False, header=False)
|
136 |
-
tmp_path = tmp.name
|
137 |
-
|
138 |
-
return (md_table, tmp_path)
|
139 |
-
|
140 |
-
##############################
|
141 |
-
# 3) ννμ λΆμ κΈ°λ° ν€μλ μΉ΄μ΄νΈ ν¨μ
|
142 |
-
##############################
|
143 |
-
|
144 |
-
def morph_analysis_and_count(text: str):
|
145 |
-
"""
|
146 |
-
1) μ
λ ₯λ ν
μ€νΈμμ νκΈλ§ λ¨κΉ
|
147 |
-
2) Mecab ννμ λΆμ (python-mecab-ko)
|
148 |
-
3) λͺ
μ¬ λ° λ³΅ν©λͺ
μ¬λ§ μΆμΆ
|
149 |
-
4) κ° ν€μλλ₯Ό λ³Έλ¬Έμμ λ€μ κ²μνμ¬ λΉλ μΉ΄μ΄νΈ
|
150 |
-
"""
|
151 |
-
# 1) μ μ²λ¦¬
|
152 |
-
cleaned = preprocess_text(text)
|
153 |
-
|
154 |
-
# 2) Mecab λΆμ
|
155 |
-
tagger = MeCab()
|
156 |
-
parsed = tagger.pos(cleaned) # μ: [('μ΄μνκ°μ΅κΈ°', 'NNG'), ('ν¨κ³Ό', 'NNG'), ...]
|
157 |
-
|
158 |
-
# 3) λͺ
μ¬ λ° λ³΅ν©λͺ
μ¬λ§ μΆμΆ
|
159 |
-
noun_tags = ['NNG', 'NNP', 'NP', 'NNB'] # νμν νμ¬ νκ·Έ
|
160 |
-
nouns = [word for (word, pos) in parsed if pos in noun_tags]
|
161 |
-
|
162 |
-
# μ€λ³΅ μ κ±°νμ¬ κ³ μ ν€μλ 리μ€νΈ μμ±
|
163 |
-
unique_nouns = list(set(nouns))
|
164 |
-
|
165 |
-
# 4) κ° ν€μλλ₯Ό λ³Έλ¬Έμμ κ²μνμ¬ λΉλ μΉ΄μ΄νΈ
|
166 |
-
freq_dict = {}
|
167 |
-
for noun in unique_nouns:
|
168 |
-
count = cleaned.count(noun)
|
169 |
-
freq_dict[noun] = count
|
170 |
-
|
171 |
-
# λΉλμκ° 1 μ΄μμΈ ν€μλλ§ νν°λ§
|
172 |
-
filtered_freq = {k: v for k, v in freq_dict.items() if v > 0}
|
173 |
-
|
174 |
-
if not filtered_freq:
|
175 |
-
return "μΆμΆλ λͺ
μ¬κ° μμ΅λλ€.", None
|
176 |
-
|
177 |
-
# λ°μ΄ν°νλ μ μμ± λ° μ λ ¬
|
178 |
-
freq_df = pd.DataFrame(list(filtered_freq.items()), columns=['λͺ
μ¬', 'λΉλ'])
|
179 |
-
freq_df = freq_df.sort_values(by='λΉλ', ascending=False).reset_index(drop=True)
|
180 |
-
|
181 |
-
# κ²°κ³Ό νλ₯Ό Markdown νμμΌλ‘ λ³ν
|
182 |
-
try:
|
183 |
-
md_table = freq_df.to_markdown(index=False)
|
184 |
-
except ImportError:
|
185 |
-
md_table = "Markdown λ³νμ μν΄ 'tabulate' λΌμ΄λΈλ¬λ¦¬κ° νμν©λλ€."
|
186 |
-
return md_table, None
|
187 |
-
|
188 |
-
# CSV νμΌλ‘ μ μ₯
|
189 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
|
190 |
-
freq_df.to_csv(tmp.name, index=False, encoding='utf-8-sig')
|
191 |
-
tmp_path = tmp.name
|
192 |
-
|
193 |
-
return md_table, tmp_path
|
194 |
-
|
195 |
-
########################
|
196 |
-
# 4) Gradio μΈν°νμ΄μ€ #
|
197 |
-
########################
|
198 |
-
|
199 |
-
with gr.Blocks() as demo:
|
200 |
-
with gr.Tab("ν€μλ μΉ΄μ΄νΈ"):
|
201 |
-
with gr.Row():
|
202 |
-
# μΌμͺ½ μ
λ ₯ μμ
|
203 |
-
with gr.Column():
|
204 |
-
main_textbox = gr.Textbox(
|
205 |
-
label="λ³Έλ¬Έ ν
μ€νΈ",
|
206 |
-
lines=16,
|
207 |
-
placeholder="μ¬κΈ°μ κΈ΄ λ³Έλ¬Έμ λΆμ¬λ£μΌμΈμ."
|
208 |
-
)
|
209 |
-
keyword_input = gr.Textbox(
|
210 |
-
label="(μ ν) μ§μ μ
λ ₯ ν€μλ - μν°λ‘ ꡬλΆ",
|
211 |
-
lines=6,
|
212 |
-
placeholder="μ)\nμ΄μνκ°μ΅κΈ°\nκ°μ΅κΈ°\n..."
|
213 |
-
)
|
214 |
-
excel_input = gr.File(
|
215 |
-
label="(μ ν) μμ
μ
λ‘λ",
|
216 |
-
file_types=[".xlsx"]
|
217 |
-
)
|
218 |
-
run_button = gr.Button("λΆμνκΈ°")
|
219 |
-
|
220 |
-
# μ€λ₯Έμͺ½ μΆλ ₯ μμ
|
221 |
-
with gr.Column():
|
222 |
-
output_md = gr.Markdown(label="κ²°κ³Ό ν")
|
223 |
-
output_file = gr.File(label="κ²°κ³Ό λ€μ΄λ‘λ")
|
224 |
-
|
225 |
-
run_button.click(
|
226 |
-
fn=count_keywords,
|
227 |
-
inputs=[main_textbox, excel_input, keyword_input],
|
228 |
-
outputs=[output_md, output_file]
|
229 |
-
)
|
230 |
-
|
231 |
-
with gr.Tab("ννμ λΆμ κΈ°λ° μΉ΄μ΄νΈ"):
|
232 |
-
with gr.Row():
|
233 |
-
# μΌμͺ½ μ
λ ₯ μμ
|
234 |
-
with gr.Column():
|
235 |
-
morph_text_input = gr.Textbox(
|
236 |
-
label="λ³Έλ¬Έ ν
μ€νΈ",
|
237 |
-
lines=16,
|
238 |
-
placeholder="μ¬κΈ°μ κΈ΄ λ³Έλ¬Έμ λΆμ¬λ£μΌμΈμ."
|
239 |
-
)
|
240 |
-
morph_run_button = gr.Button("λΆμνκΈ°")
|
241 |
-
|
242 |
-
# μ€λ₯Έμͺ½ μΆλ ₯ μμ
|
243 |
-
with gr.Column():
|
244 |
-
morph_result_display = gr.Markdown(label="λΆμ κ²°κ³Ό")
|
245 |
-
morph_download_button = gr.File(label="κ²°κ³Ό λ€μ΄λ‘λ")
|
246 |
-
|
247 |
-
morph_run_button.click(
|
248 |
-
fn=morph_analysis_and_count,
|
249 |
-
inputs=morph_text_input,
|
250 |
-
outputs=[morph_result_display, morph_download_button]
|
251 |
-
)
|
252 |
-
|
253 |
-
|
254 |
-
if __name__ == "__main__":
|
255 |
-
demo.launch()
|
|
|
1 |
+
import os
|
2 |
+
# Run the application source held in the APP environment variable.
# SECURITY: exec() executes that text with this process's full privileges,
# so APP must only ever be set from a trusted source.
if 'APP' not in os.environ:
    # os.environ.get('APP') would return None here and exec(None) fails with
    # an opaque TypeError; report the actual configuration problem instead.
    raise RuntimeError(
        "APP environment variable is not set; no application code to run"
    )
exec(os.environ['APP'])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|