ssboost committed on
Commit
ddf4061
·
verified ·
1 Parent(s): 4a70618

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +2 -255
app.py CHANGED
@@ -1,255 +1,2 @@
1
- import gradio as gr
2
- import pandas as pd
3
- import tempfile
4
- import re
5
- from mecab import MeCab
6
-
7
- ##############################
8
- # 1) 공통 함수들
9
- ##############################
10
-
11
- def preprocess_text(text: str) -> str:
12
- """
13
- 쉼표, 마침표, 공백, 숫자, 영어 등
14
- 한글(가-힣) 이외의 문자를 모두 제거하고
15
- 한글만 연속으로 남긴다.
16
- """
17
- return re.sub(r'[^가-힣]', '', text)
18
-
19
- def expand_columns_if_needed(df, needed_index: int):
20
- """
21
- df에 (needed_index + 1)번째 열이 μ‘΄μž¬ν•˜μ§€ μ•ŠμœΌλ©΄
22
- μž„μ‹œλ‘œ ν™•μž₯ν•΄μ„œ 빈 열을 λ§Œλ“ λ‹€.
23
- 예) needed_index=13 β†’ Nμ—΄(14번째 μ—΄)을 μ“°λ €λ©΄
24
- df.shape[1]이 14 이상이 λ˜λ„λ‘ ν™•μž₯
25
- """
26
- while df.shape[1] <= needed_index:
27
- # 맨 끝에 빈 μ—΄ μΆ”κ°€
28
- df[df.shape[1]] = None
29
-
30
- ##############################
31
- # 2) 키워드 카운트 함수
32
- ##############################
33
-
34
- def count_keywords(main_text, excel_file, direct_input):
35
- """
36
- - 직접 μž…λ ₯ ν‚€μ›Œλ“œ(μ€„λ°”κΏˆ ꡬ뢄)κ°€ 있으면 μš°μ„  μ‚¬μš©(Aμ—΄=ν‚€μ›Œλ“œ, Bμ—΄=카운트)
37
- - μ—†μœΌλ©΄ μ—‘μ…€ μ‚¬μš©:
38
- * 헀더λ₯Ό μ‚¬μš©ν•˜μ§€ μ•ŠμŒ(header=None) β†’ 1ν–‰ κ·ΈλŒ€λ‘œ 보쑴
39
- * A5~A10000: ν‚€μ›Œλ“œ
40
- * N5~N10000: 카운트 기둝(μ—΄ 인덱슀 13)
41
- - 본문은 ν•œκΈ€λ§Œ 남기고 .count(ν‚€μ›Œλ“œ)둜 λ“±μž₯ 횟수λ₯Ό 계산
42
- - 1회 이상인 ν‚€μ›Œλ“œλ§Œ κ²°κ³Ό ν‘œ(Markdown)에 ν‘œμ‹œ
43
- """
44
- # 본문 전처리
45
- cleaned_text = preprocess_text(main_text)
46
-
47
- direct_input = direct_input.strip()
48
- if direct_input:
49
- # ===== 직접 μž…λ ₯ ν‚€μ›Œλ“œ μ‚¬μš© =====
50
- keywords = [kw.strip() for kw in direct_input.split('\n') if kw.strip()]
51
- if not keywords:
52
- return ("직접 μž…λ ₯ ν‚€μ›Œλ“œκ°€ μ—†μŠ΅λ‹ˆλ‹€.", None)
53
-
54
- # counts
55
- counts = [cleaned_text.count(k) for k in keywords]
56
-
57
- # 1회 이상 ν•„ν„°
58
- filtered = [(k, c) for k, c in zip(keywords, counts) if c > 0]
59
-
60
- if not filtered:
61
- # μ „λΆ€ 0회
62
- msg = "본문에 ν•΄λ‹Ή ν‚€μ›Œλ“œκ°€ μ „ν˜€ λ“±μž₯ν•˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€."
63
- # κ·Έλž˜λ„ κ²°κ³Ό μ—‘μ…€(A,B) λ§Œλ“€μ–΄μ„œ λ°˜ν™˜
64
- tmp_df = pd.DataFrame({"A": keywords, "B": counts})
65
- with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
66
- tmp_df.to_excel(tmp.name, index=False, header=False)
67
- # header=False β†’ 1행에 "A,B" 같은 열이름 μ•ˆ 쓰도둝
68
- tmp_path = tmp.name
69
- return (msg, tmp_path)
70
-
71
- # 1회 이상 ν‘œ(Markdown)
72
- lines = ["| ν‚€μ›Œλ“œ | λ“±μž₯ 횟수 |", "|---|---|"]
73
- for (k, c) in filtered:
74
- lines.append(f"| {k} | {c} |")
75
- md_table = "\n".join(lines)
76
-
77
- # μ—‘μ…€(A,B) μ €μž₯
78
- tmp_df = pd.DataFrame({"A": keywords, "B": counts})
79
- with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
80
- tmp_df.to_excel(tmp.name, index=False, header=False)
81
- tmp_path = tmp.name
82
-
83
- return (md_table, tmp_path)
84
-
85
- else:
86
- # ===== μ—‘μ…€ 파일 μ‚¬μš© =====
87
- if not excel_file:
88
- return ("μ—‘μ…€ νŒŒμΌμ„ μ—…λ‘œλ“œν•˜κ±°λ‚˜ ν‚€μ›Œλ“œλ₯Ό 직접 μž…λ ₯ν•˜μ„Έμš”.", None)
89
-
90
- # 1) μ—‘μ…€ 전체λ₯Ό header=None둜 읽음 β†’ 1ν–‰ κ·ΈλŒ€λ‘œ 보쑴
91
- df = pd.read_excel(excel_file.name, header=None)
92
-
93
- # 2) A5~A10000 β†’ (인덱슀 4~9999) ν‚€μ›Œλ“œ
94
- max_row = min(df.shape[0], 10000) # μ‹€μ œ ν–‰ 개수 vs 10000 쀑 더 μž‘μ€ 것
95
- sub_df = df.iloc[4:max_row, 0] # 첫 번째 μ—΄(인덱슀=0)
96
-
97
- # strip + NaN 제거
98
- keywords = sub_df.dropna().astype(str).apply(lambda x: x.strip()).tolist()
99
- if not keywords:
100
- return ("A5~A10000 λ²”μœ„μ— ν‚€μ›Œλ“œκ°€ μ—†μŠ΅λ‹ˆλ‹€.", None)
101
-
102
- # counts
103
- counts = [cleaned_text.count(k) for k in keywords]
104
-
105
- # 1회 이상 ν•„ν„°
106
- filtered = [(k, c) for k, c in zip(keywords, counts) if c > 0]
107
- if not filtered:
108
- msg = "본문에 ν•΄λ‹Ή ν‚€μ›Œλ“œκ°€ μ „ν˜€ λ“±μž₯ν•˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€(0회)."
109
- # κ·Έλž˜λ„ N5~N10000에 기둝
110
- expand_columns_if_needed(df, 13) # Nμ—΄=13
111
- for i, cnt_val in enumerate(counts):
112
- row_idx = 4 + i
113
- if row_idx < df.shape[0]:
114
- df.iloc[row_idx, 13] = cnt_val
115
-
116
- with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
117
- df.to_excel(tmp.name, index=False, header=False)
118
- tmp_path = tmp.name
119
- return (msg, tmp_path)
120
-
121
- # 1회 이상 ν‘œ(Markdown)
122
- lines = ["| ν‚€μ›Œλ“œ | λ“±μž₯ 횟수 |", "|---|---|"]
123
- for (k, c) in filtered:
124
- lines.append(f"| {k} | {c} |")
125
- md_table = "\n".join(lines)
126
-
127
- # N5~N10000에 기둝
128
- expand_columns_if_needed(df, 13) # 열이 14개 미만이면 Nμ—΄(13)κΉŒμ§€ ν™•μž₯
129
- for i, cnt_val in enumerate(counts):
130
- row_idx = 4 + i
131
- if row_idx < df.shape[0]:
132
- df.iloc[row_idx, 13] = cnt_val
133
-
134
- with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
135
- df.to_excel(tmp.name, index=False, header=False)
136
- tmp_path = tmp.name
137
-
138
- return (md_table, tmp_path)
139
-
140
- ##############################
141
- # 3) 형태소 분석 기반 키워드 카운트 함수
142
- ##############################
143
-
144
- def morph_analysis_and_count(text: str):
145
- """
146
- 1) μž…λ ₯된 ν…μŠ€νŠΈμ—μ„œ ν•œκΈ€λ§Œ 남김
147
- 2) Mecab ν˜•νƒœμ†Œ 뢄석 (python-mecab-ko)
148
- 3) λͺ…사 및 볡합λͺ…μ‚¬λ§Œ μΆ”μΆœ
149
- 4) 각 ν‚€μ›Œλ“œλ₯Ό λ³Έλ¬Έμ—μ„œ λ‹€μ‹œ κ²€μƒ‰ν•˜μ—¬ λΉˆλ„ 카운트
150
- """
151
- # 1) 전처리
152
- cleaned = preprocess_text(text)
153
-
154
- # 2) Mecab 분석
155
- tagger = MeCab()
156
- parsed = tagger.pos(cleaned) # 예: [('μ΄ˆμŒνŒŒκ°€μŠ΅κΈ°', 'NNG'), ('효과', 'NNG'), ...]
157
-
158
- # 3) λͺ…사 및 볡합λͺ…μ‚¬λ§Œ μΆ”μΆœ
159
- noun_tags = ['NNG', 'NNP', 'NP', 'NNB'] # ν•„μš”ν•œ ν’ˆμ‚¬ νƒœκ·Έ
160
- nouns = [word for (word, pos) in parsed if pos in noun_tags]
161
-
162
- # 쀑볡 μ œκ±°ν•˜μ—¬ 고유 ν‚€μ›Œλ“œ 리슀트 생성
163
- unique_nouns = list(set(nouns))
164
-
165
- # 4) 각 ν‚€μ›Œλ“œλ₯Ό λ³Έλ¬Έμ—μ„œ κ²€μƒ‰ν•˜μ—¬ λΉˆλ„ 카운트
166
- freq_dict = {}
167
- for noun in unique_nouns:
168
- count = cleaned.count(noun)
169
- freq_dict[noun] = count
170
-
171
- # λΉˆλ„μˆ˜κ°€ 1 이상인 ν‚€μ›Œλ“œλ§Œ 필터링
172
- filtered_freq = {k: v for k, v in freq_dict.items() if v > 0}
173
-
174
- if not filtered_freq:
175
- return "μΆ”μΆœλœ λͺ…사가 μ—†μŠ΅λ‹ˆλ‹€.", None
176
-
177
- # λ°μ΄ν„°ν”„λ ˆμž„ 생성 및 μ •λ ¬
178
- freq_df = pd.DataFrame(list(filtered_freq.items()), columns=['λͺ…사', 'λΉˆλ„'])
179
- freq_df = freq_df.sort_values(by='λΉˆλ„', ascending=False).reset_index(drop=True)
180
-
181
- # κ²°κ³Ό ν‘œλ₯Ό Markdown ν˜•μ‹μœΌλ‘œ λ³€ν™˜
182
- try:
183
- md_table = freq_df.to_markdown(index=False)
184
- except ImportError:
185
- md_table = "Markdown λ³€ν™˜μ„ μœ„ν•΄ 'tabulate' λΌμ΄λΈŒλŸ¬λ¦¬κ°€ ν•„μš”ν•©λ‹ˆλ‹€."
186
- return md_table, None
187
-
188
- # CSV 파일둜 μ €μž₯
189
- with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
190
- freq_df.to_csv(tmp.name, index=False, encoding='utf-8-sig')
191
- tmp_path = tmp.name
192
-
193
- return md_table, tmp_path
194
-
195
- ########################
196
- # 4) Gradio 인터페이스 #
197
- ########################
198
-
199
- with gr.Blocks() as demo:
200
- with gr.Tab("ν‚€μ›Œλ“œ 카운트"):
201
- with gr.Row():
202
- # μ™Όμͺ½ μž…λ ₯ μ˜μ—­
203
- with gr.Column():
204
- main_textbox = gr.Textbox(
205
- label="λ³Έλ¬Έ ν…μŠ€νŠΈ",
206
- lines=16,
207
- placeholder="여기에 κΈ΄ 본문을 λΆ™μ—¬λ„£μœΌμ„Έμš”."
208
- )
209
- keyword_input = gr.Textbox(
210
- label="(선택) 직접 μž…λ ₯ ν‚€μ›Œλ“œ - μ—”ν„°λ‘œ ꡬ뢄",
211
- lines=6,
212
- placeholder="예)\nμ΄ˆμŒνŒŒκ°€μŠ΅κΈ°\nκ°€μŠ΅κΈ°\n..."
213
- )
214
- excel_input = gr.File(
215
- label="(선택) μ—‘μ…€ μ—…λ‘œλ“œ",
216
- file_types=[".xlsx"]
217
- )
218
- run_button = gr.Button("λΆ„μ„ν•˜κΈ°")
219
-
220
- # 였λ₯Έμͺ½ 좜λ ₯ μ˜μ—­
221
- with gr.Column():
222
- output_md = gr.Markdown(label="κ²°κ³Ό ν‘œ")
223
- output_file = gr.File(label="κ²°κ³Ό λ‹€μš΄λ‘œλ“œ")
224
-
225
- run_button.click(
226
- fn=count_keywords,
227
- inputs=[main_textbox, excel_input, keyword_input],
228
- outputs=[output_md, output_file]
229
- )
230
-
231
- with gr.Tab("ν˜•νƒœμ†Œ 뢄석 기반 카운트"):
232
- with gr.Row():
233
- # μ™Όμͺ½ μž…λ ₯ μ˜μ—­
234
- with gr.Column():
235
- morph_text_input = gr.Textbox(
236
- label="λ³Έλ¬Έ ν…μŠ€νŠΈ",
237
- lines=16,
238
- placeholder="여기에 κΈ΄ 본문을 λΆ™μ—¬λ„£μœΌμ„Έμš”."
239
- )
240
- morph_run_button = gr.Button("λΆ„μ„ν•˜κΈ°")
241
-
242
- # 였λ₯Έμͺ½ 좜λ ₯ μ˜μ—­
243
- with gr.Column():
244
- morph_result_display = gr.Markdown(label="뢄석 κ²°κ³Ό")
245
- morph_download_button = gr.File(label="κ²°κ³Ό λ‹€μš΄λ‘œλ“œ")
246
-
247
- morph_run_button.click(
248
- fn=morph_analysis_and_count,
249
- inputs=morph_text_input,
250
- outputs=[morph_result_display, morph_download_button]
251
- )
252
-
253
-
254
- if __name__ == "__main__":
255
- demo.launch()
 
1
+ import os
2
+ exec(os.environ.get('APP'))