dk-davidekim committed
Commit 5515112 • 1 Parent(s): 7c4c56e

Delete pages

Files changed (1)
  1. pages/beta.py +0 -312
pages/beta.py DELETED
@@ -1,312 +0,0 @@
- import pandas as pd
- import requests
- import streamlit as st
- from streamlit_lottie import st_lottie
- import torch
- from transformers import AutoTokenizer, AutoModelForCausalLM
- import re
-
- # Page Config
- st.set_page_config(
-     page_title="노래 가사 n행시 Beta",
-     page_icon="💌",
-     layout="wide"
- )
- # st.text(os.listdir(os.curdir))
-
- ### Model
- tokenizer = AutoTokenizer.from_pretrained("wumusill/final_project_kogpt2")
-
- @st.cache(show_spinner=False)
- def load_model():
-     model = AutoModelForCausalLM.from_pretrained("wumusill/final_project_kogpt2")
-     return model
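- # @st.cache memoizes load_model(), so the KoGPT2 weights are loaded once per
- # session instead of on every Streamlit rerun.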
-
- model = load_model()
-
- @st.cache(show_spinner=False)
- def get_word():
-     word = pd.read_csv("ballad_word.csv", encoding="cp949")
-     return word
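- # ballad_word.csv is read as cp949, the legacy Korean Windows encoding, so the
- # vocabulary file must be saved in that encoding rather than UTF-8.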
-
-
- word = get_word()
-
-
- one = word[word["0"].str.startswith("한")].sample(1).values[0][0]
- # st.header(type(one))
- # st.header(one)
-
-
- # Class: wrapper so the result dict can hold the same letter more than once
- class poem(object):
-     def __init__(self, letter):
-         self.letter = letter
-
-     def __str__(self):
-         return self.letter
-
-     def __repr__(self):
-         return "'" + self.letter + "'"
-
-
- def beta_poem(input_letter):
-     # Initial-sound-rule (dueum beopchik) dictionary: syllables that cannot
-     # begin a Korean word are mapped to their permitted word-initial forms
-     dooeum = {"라":"나", "락":"낙", "란":"난", "랄":"날", "람":"남", "랍":"납", "랑":"낭",
-               "래":"내", "랭":"냉", "냑":"약", "략":"약", "냥":"양", "량":"양", "녀":"여",
-               "려":"여", "녁":"역", "력":"역", "년":"연", "련":"연", "녈":"열", "렬":"열",
-               "념":"염", "렴":"염", "렵":"엽", "녕":"영", "령":"영", "녜":"예", "례":"예",
-               "로":"노", "록":"녹", "론":"논", "롱":"농", "뢰":"뇌", "뇨":"요", "료":"요",
-               "룡":"용", "루":"누", "뉴":"유", "류":"유", "뉵":"육", "륙":"육", "륜":"윤",
-               "률":"율", "륭":"융", "륵":"늑", "름":"늠", "릉":"능", "니":"이", "리":"이",
-               "린":"인", "림":"임", "립":"입"}
-     # List to collect the generated lines
-     res_l = []
-     len_sequence = 0
-
-     # Iterate over the input one syllable at a time, with its index
-     for idx, val in enumerate(input_letter):
-         # Apply the initial-sound rule
-         if val in dooeum:
-             val = dooeum[val]
-
-         # Seed with a word from the ballad vocabulary
-         try:
-             one = word[word["0"].str.startswith(val)].sample(1).values[0][0]
-             # st.text(one)
-         except ValueError:
-             # No vocabulary word starts with this syllable; use the syllable itself
-             one = val
-
-         # For a smoother acrostic, join the previous lines with the current syllable;
-         # the data belonging to the previous lines is removed from the generated
-         # sentence afterwards
-         link_with_pre_sentence = (" ".join(res_l) + " " + one + " " if idx != 0 else one).strip()
-         # print(link_with_pre_sentence)
-
-         # Encode the joined sentence
-         input_ids = tokenizer.encode(link_with_pre_sentence, add_special_tokens=False, return_tensors="pt")
-
-         # Generate a sentence from the encoded input
-         output_sequence = model.generate(
-             input_ids=input_ids,
-             do_sample=True,
-             max_length=42,
-             min_length=len_sequence + 2,
-             temperature=0.9,
-             repetition_penalty=1.5,
-             no_repeat_ngram_size=2)
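-         # Decoding setup: sampling at temperature 0.9 keeps the lines varied,
-         # repetition_penalty=1.5 and no_repeat_ngram_size=2 discourage echo
-         # loops, and min_length grows with len_sequence so every pass must
-         # produce tokens beyond the re-fed previous lines.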
97
-
98
- # μƒμ„±λœ λ¬Έμž₯ 리슀트둜 λ³€ν™˜ (인코딩 λ˜μ–΄μžˆκ³ , μƒμ„±λœ λ¬Έμž₯ λ’€λ‘œ padding 이 μžˆλŠ” μƒνƒœ)
99
- generated_sequence = output_sequence.tolist()[0]
100
-
101
- # padding index μ•žκΉŒμ§€ slicing ν•¨μœΌλ‘œμ¨ padding 제거, padding이 없을 μˆ˜λ„ 있기 λ•Œλ¬Έμ— 쑰건문 확인 ν›„ 제거
102
- # μ‚¬μš©ν•  generated_sequence κ°€ 5보닀 짧으면 κ°•μ œμ μœΌλ‘œ 길이λ₯Ό 8둜 ν•΄μ€€λ‹€...
103
- if tokenizer.pad_token_id in generated_sequence:
104
- check_index = generated_sequence.index(tokenizer.pad_token_id)
105
- check_index = check_index if check_index-len_sequence > 3 else len_sequence + 8
106
- generated_sequence = generated_sequence[:check_index]
107
-
108
- word_encode = tokenizer.encode(one, add_special_tokens=False, return_tensors="pt").tolist()[0][0]
109
- split_index = len(generated_sequence) - 1 - generated_sequence[::-1].index(word_encode)
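-         # split_index is the position of the *last* occurrence of the seed
-         # word's first token (found by searching the reversed list), i.e.
-         # where the newly generated line starts after the re-fed prefix.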
-
-         # If this is not the first letter, drop the token IDs of the previous
-         # lines so only the newly generated line enters the result list
-         generated_sequence = generated_sequence[split_index:]
-
-         # print(tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True, skip_special_tokens=True))
-         # Update the running length for the next syllable
-         len_sequence += len([elem for elem in generated_sequence if elem not in tokenizer.all_special_ids])
-         # Decode the result
-         decoded_sequence = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True, skip_special_tokens=True)
-
-         # Append to the result list
-         res_l.append(decoded_sequence)
-
-     poem_dict = {"Type": "beta"}
-
-     for letter, res in zip(input_letter, res_l):
-         # decode_res = tokenizer.decode(res, clean_up_tokenization_spaces=True, skip_special_tokens=True)
-         poem_dict[poem(letter)] = res
-
-     return poem_dict
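- # Illustrative (hypothetical) return shape of beta_poem for the input "사랑":
- #   {"Type": "beta", '사': "사랑이 지나간 자리", '랑': "낭만을 담은 노래"}
- # The syllable keys are `poem` instances rendered via __repr__; note the
- # dueum rule turning 랑 into a line that starts with 낭.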
-
- def alpha_poem(input_letter):
-
-     # Initial-sound-rule (dueum beopchik) dictionary, identical to beta_poem's
-     dooeum = {"라":"나", "락":"낙", "란":"난", "랄":"날", "람":"남", "랍":"납", "랑":"낭",
-               "래":"내", "랭":"냉", "냑":"약", "략":"약", "냥":"양", "량":"양", "녀":"여",
-               "려":"여", "녁":"역", "력":"역", "년":"연", "련":"연", "녈":"열", "렬":"열",
-               "념":"염", "렴":"염", "렵":"엽", "녕":"영", "령":"영", "녜":"예", "례":"예",
-               "로":"노", "록":"녹", "론":"논", "롱":"농", "뢰":"뇌", "뇨":"요", "료":"요",
-               "룡":"용", "루":"누", "뉴":"유", "류":"유", "뉵":"육", "륙":"육", "륜":"윤",
-               "률":"율", "륭":"융", "륵":"늑", "름":"늠", "릉":"능", "니":"이", "리":"이",
-               "린":"인", "림":"임", "립":"입"}
-     # List to collect the generated lines
-     res_l = []
-
-     # Iterate over the input one syllable at a time, with its index
-     for idx, val in enumerate(input_letter):
-         # Apply the initial-sound rule
-         if val in dooeum:
-             val = dooeum[val]
150
-
151
-
152
- while True:
153
- # λ§Œμ•½ idx κ°€ 0 이라면 == 첫 κΈ€μž
154
- if idx == 0:
155
- # 첫 κΈ€μž 인코딩
156
- input_ids = tokenizer.encode(
157
- val, add_special_tokens=False, return_tensors="pt")
158
- # print(f"{idx}번 인코딩 : {input_ids}\n") # 2차원 ν…μ„œ
159
-
160
- # 첫 κΈ€μž 인코딩 κ°’μœΌλ‘œ λ¬Έμž₯ 생성
161
- output_sequence = model.generate(
162
- input_ids=input_ids,
163
- do_sample=True,
164
- max_length=42,
165
- min_length=5,
166
- temperature=0.9,
167
- repetition_penalty=1.7,
168
- no_repeat_ngram_size=2)[0]
169
- # print("첫 κΈ€μž 인코딩 ν›„ generate κ²°κ³Ό:", output_sequence, "\n") # tensor
170
-
171
- # 첫 κΈ€μžκ°€ μ•„λ‹ˆλΌλ©΄
172
- else:
173
- # ν•œ 음절
174
- input_ids = tokenizer.encode(
175
- val, add_special_tokens=False, return_tensors="pt")
176
- # print(f"{idx}번 μ§Έ κΈ€μž 인코딩 : {input_ids} \n")
177
-
178
- # 쒀더 λ§€λ„λŸ¬μš΄ μ‚Όν–‰μ‹œλ₯Ό μœ„ν•΄ 이전 인코딩과 μ§€κΈˆ 인코딩 μ—°κ²°
179
- link_with_pre_sentence = torch.cat((generated_sequence, input_ids[0]), 0)
180
- link_with_pre_sentence = torch.reshape(link_with_pre_sentence, (1, len(link_with_pre_sentence)))
181
- # print(f"이전 ν…μ„œμ™€ μ—°κ²°λœ ν…μ„œ {link_with_pre_sentence} \n")
182
-
183
- # 인코딩 κ°’μœΌλ‘œ λ¬Έμž₯ 생성
184
- output_sequence = model.generate(
185
- input_ids=link_with_pre_sentence,
186
- do_sample=True,
187
- max_length=42,
188
- min_length=5,
189
- temperature=0.9,
190
- repetition_penalty=1.7,
191
- no_repeat_ngram_size=2)[0]
192
- # print(f"{idx}번 인코딩 ν›„ generate : {output_sequence}")
193
-
194
- # μƒμ„±λœ λ¬Έμž₯ 리슀트둜 λ³€ν™˜ (인코딩 λ˜μ–΄μžˆκ³ , μƒμ„±λœ λ¬Έμž₯ λ’€λ‘œ padding 이 μžˆλŠ” μƒνƒœ)
195
- generated_sequence = output_sequence.tolist()
196
- # print(f"{idx}번 인코딩 리슀트 : {generated_sequence} \n")
197
-
198
- # padding index μ•žκΉŒμ§€ slicing ν•¨μœΌλ‘œμ¨ padding 제거, padding이 없을 μˆ˜λ„ 있기 λ•Œλ¬Έμ— 쑰건문 확인 ν›„ 제거
199
- if tokenizer.pad_token_id in generated_sequence:
200
- generated_sequence = generated_sequence[:generated_sequence.index(tokenizer.pad_token_id)]
201
-
202
- generated_sequence = torch.tensor(generated_sequence)
203
- # print(f"{idx}번 인코딩 리슀트 νŒ¨λ”© 제거 ν›„ λ‹€μ‹œ ν…μ„œ : {generated_sequence} \n")
204
-
205
- # 첫 κΈ€μžκ°€ μ•„λ‹ˆλΌλ©΄, generate 된 음절만 κ²°κ³Όλ¬Ό list에 λ“€μ–΄κ°ˆ 수 있게 μ•ž λ¬Έμž₯에 λŒ€ν•œ 인코딩 κ°’ 제거
206
- # print(generated_sequence)
207
- if idx != 0:
208
- # 이전 λ¬Έμž₯의 길이 μ΄ν›„λ‘œ μŠ¬λΌμ΄μ‹±ν•΄μ„œ μ•ž λ¬Έμž₯ 제거
209
- generated_sequence = generated_sequence[len_sequence:]
210
-
211
- len_sequence = len(generated_sequence)
212
- # print("len_seq", len_sequence)
213
-
214
- # 음절 κ·ΈλŒ€λ‘œ λ±‰μœΌλ©΄ λ‹€μ‹œ 해와, μ•„λ‹ˆλ©΄ whileλ¬Έ νƒˆμΆœ
215
- if len_sequence > 1:
216
- break
217
-
218
- # κ²°κ³Όλ¬Ό λ¦¬μŠ€νŠΈμ— λ‹΄κΈ°
219
- res_l.append(generated_sequence)
-
-     poem_dict = {"Type": "alpha"}
-
-     for letter, res in zip(input_letter, res_l):
-         decode_res = tokenizer.decode(res, clean_up_tokenization_spaces=True, skip_special_tokens=True)
-         poem_dict[poem(letter)] = decode_res
-
-     return poem_dict
-
- # Image (.gif)
- @st.cache(show_spinner=False)
- def load_lottieurl(url: str):
-     r = requests.get(url)
-     if r.status_code != 200:
-         return None
-     return r.json()
-
- lottie_url = "https://assets7.lottiefiles.com/private_files/lf30_fjln45y5.json"
-
- lottie_json = load_lottieurl(lottie_url)
- # Guard against a failed download (load_lottieurl returns None on non-200)
- if lottie_json:
-     st_lottie(lottie_json, speed=1, height=200, key="initial")
-
-
- # Title
- row0_spacer1, row0_1, row0_spacer2, row0_2, row0_spacer3 = st.columns(
-     (0.01, 2, 0.05, 0.5, 0.01)
- )
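- # Layout note: the very narrow spacer columns (width 0.01) act as left/right
- # margins around the two content columns, a common Streamlit layout trick.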
-
- with row0_1:
-     st.markdown("# 한글 노래 가사 n행시✍")
-     st.markdown("### 🦁멋쟁이사자처럼 AIS7🦁 - 파이널 프로젝트")
-
- with row0_2:
-     st.write("")
-     st.write("")
-     st.write("")
-     st.subheader("1조 - 해파리")
-     st.write("이지혜, 최지영, 권소희, 문종현, 구자현, 김의준")
-
- st.write('---')
260
-
261
- # Explanation
262
- row1_spacer1, row1_1, row1_spacer2 = st.columns((0.01, 0.01, 0.01))
263
-
264
- with row1_1:
265
- st.markdown("### nν–‰μ‹œ κ°€μ΄λ“œλΌμΈ")
266
- st.markdown("1. ν•˜λ‹¨μ— μžˆλŠ” ν…μŠ€νŠΈλ°”μ— 5자 μ΄ν•˜ 단어λ₯Ό λ„£μ–΄μ£Όμ„Έμš”")
267
- st.markdown("2. 'nν–‰μ‹œ μ œμž‘ν•˜κΈ°' λ²„νŠΌμ„ ν΄λ¦­ν•΄μ£Όμ„Έμš”")
268
- st.markdown("* nν–‰μ‹œ νƒ€μž… μ„€μ •\n"
269
- " * Alpha ver. : λͺ¨λΈμ΄ 첫 μŒμ ˆλΆ€ν„° 생성\n"
270
- " * Beta ver. : 첫 μŒμ ˆμ„ λ°μ΄ν„°μ…‹μ—μ„œ μ°Ύκ³ , λ‹€μŒ 뢀뢄을 생성")
271
-
272
- st.write('---')
-
- # Model & Input
- row2_spacer1, row2_1, row2_spacer2 = st.columns((0.01, 0.01, 0.01))
-
- col1, col2 = st.columns(2)
-
- # Word Input
- with row2_1:
-
-     with col1:
-         genre = st.radio(
-             "n행시 타입 선택",
-             ('Alpha', 'Beta(test중)'))
-
-         if genre == 'Alpha':
-             n_line_poem = alpha_poem
-
-         else:
-             n_line_poem = beta_poem
-
-     with col2:
-         word_input = st.text_input(
-             "n행시에 사용할 단어를 적고 버튼을 눌러주세요.(최대 5자) 👇",
-             placeholder='한글 단어를 입력해주세요',
-             max_chars=5
-         )
-         word_input = re.sub("[^가-힣]", "", word_input)
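-         # Keep only complete Hangul syllables: 가 (U+AC00) through 힣 (U+D7A3).
-         # Digits, Latin letters, and isolated jamo are all stripped out.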
-
- if st.button('n행시 제작하기'):
-     if word_input == "":
-         st.error("온전한 한글 단어를 사용해주세요!")
-
-     else:
-         st.write("n행시 단어 : ", word_input)
-         with st.spinner('잠시 기다려주세요...'):
-             result = n_line_poem(word_input)
-         st.success('완료됐습니다!')
-         for r in result:
-             st.write(f'{r} : {result[r]}')