dk-davidekim committed on
Commit 9e55db6 • 1 Parent(s): 9873fd5

Delete pages/beta.py

Files changed (1)
  1. pages/beta.py +0 -302
pages/beta.py DELETED
@@ -1,302 +0,0 @@
- import pandas as pd
- import requests
- import streamlit as st
- from streamlit_lottie import st_lottie
- import torch
- from transformers import AutoTokenizer, AutoModelForCausalLM
- import re
-
- # Page Config
- st.set_page_config(
-     page_title="노래 가사 n행시 Beta",
-     page_icon="💌",
-     layout="wide"
- )
- # st.text(os.listdir(os.curdir))
-
- ### Model
- tokenizer = AutoTokenizer.from_pretrained("wumusill/final_project_kogpt2")
-
- @st.cache(show_spinner=False)
- def load_model():
-     model = AutoModelForCausalLM.from_pretrained("wumusill/final_project_kogpt2")
-     return model
-
- model = load_model()
-
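- # Note: st.cache keeps the loaded model in memory, so the weights are
- # downloaded and initialized once rather than on every Streamlit rerun.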
- word = pd.read_csv("ballad_word.csv", encoding="cp949")
- # Candidate vocabulary as a Series; beta_poem samples words from this below
- words = word["0"]
- # st.dataframe(word)
-
- one = word[word["0"].str.startswith("한")].sample(1).values[0][0]  # unused debug sample
- # st.header(type(one))
- # st.header(one)
-
-
- # Class: wrapper that lets the result dict hold duplicate letters as keys
- class poem(object):
-     def __init__(self, letter):
-         self.letter = letter
-
-     def __str__(self):
-         return self.letter
-
-     def __repr__(self):
-         return "'" + self.letter + "'"
-
-
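- # Example: a plain dict would collapse repeated letters into a single key, but
- # each poem instance hashes by object identity, so two poem("사") keys coexist
- # while both still display as 사 via __str__/__repr__.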
- def beta_poem(input_letter):
-     # Dooeum beopchik (initial sound law) substitution table
-     dooeum = {"라":"나", "락":"낙", "란":"난", "랄":"날", "람":"남", "랍":"납", "랑":"낭",
-               "래":"내", "랭":"냉", "냑":"약", "략":"약", "냥":"양", "량":"양", "녀":"여",
-               "려":"여", "녁":"역", "력":"역", "년":"연", "련":"연", "녈":"열", "렬":"열",
-               "념":"염", "렴":"염", "렵":"엽", "녕":"영", "령":"영", "녜":"예", "례":"예",
-               "로":"노", "록":"녹", "론":"논", "롱":"농", "뢰":"뇌", "뇨":"요", "료":"요",
-               "룡":"용", "루":"누", "뉴":"유", "류":"유", "뉵":"육", "륙":"육", "륜":"윤",
-               "률":"율", "륭":"융", "륵":"늑", "름":"늠", "릉":"능", "니":"이", "리":"이",
-               "린":"인", "림":"임", "립":"입"}
-     # List to collect the generated lines
-     res_l = []
-     len_sequence = 0
-
-     # Take the input one letter at a time, together with its index
-     for idx, val in enumerate(input_letter):
-         # Apply the initial sound law
-         if val in dooeum:
-             val = dooeum[val]
-
-         # Use a word from the ballad vocabulary; fall back to the bare letter
-         # when no candidate starts with it
-         try:
-             word = words[words.str.startswith(val)].sample(1).values[0]
-         except ValueError:
-             word = val
-
-         # For a smoother acrostic, join the previous lines with the current syllable;
-         # the previous lines' tokens are stripped from the generated output below
-         link_with_pre_sentence = (" ".join(res_l) + " " + word + " " if idx != 0 else word).strip()
-         # print(link_with_pre_sentence)
-
-         # Encode the joined text
-         input_ids = tokenizer.encode(link_with_pre_sentence, add_special_tokens=False, return_tensors="pt")
-
-         # Generate a sentence from the encoded input
-         output_sequence = model.generate(
-             input_ids=input_ids,
-             do_sample=True,
-             max_length=42,
-             min_length=len_sequence + 2,
-             temperature=0.9,
-             repetition_penalty=1.5,
-             no_repeat_ngram_size=2)
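-         # Note: min_length grows with len_sequence, so each call must generate past
-         # the text accumulated so far rather than stopping right after the prompt.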
-
-         # Convert the output to a list (still token IDs, padding may follow the text)
-         generated_sequence = output_sequence.tolist()[0]
-
-         # Strip padding by slicing up to the pad index; padding may be absent, so check first.
-         # If the new part is 3 tokens or fewer, force the cut to len_sequence + 8.
-         if tokenizer.pad_token_id in generated_sequence:
-             check_index = generated_sequence.index(tokenizer.pad_token_id)
-             check_index = check_index if check_index - len_sequence > 3 else len_sequence + 8
-             generated_sequence = generated_sequence[:check_index]
-
-         # First token ID of the sampled seed word
-         word_encode = tokenizer.encode(word, add_special_tokens=False, return_tensors="pt").tolist()[0][0]
-         split_index = len(generated_sequence) - 1 - generated_sequence[::-1].index(word_encode)
-
-         # Past the first letter, drop the earlier lines' encoding so only the
-         # newly generated line goes into the result list
-         generated_sequence = generated_sequence[split_index:]
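-         # split_index is the last occurrence of the seed word's first token; the
-         # slice keeps the current line and discards everything generated before it.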
-
-         # print(tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True, skip_special_tokens=True))
-         # Update the running length for the next syllable
-         len_sequence += len([elem for elem in generated_sequence if elem not in tokenizer.all_special_ids])
-         # Decode the result
-         decoded_sequence = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True, skip_special_tokens=True)
-
-         # Append it to the result list
-         res_l.append(decoded_sequence)
-
-     poem_dict = {"Type": "beta"}
-
-     for letter, res in zip(input_letter, res_l):
-         # decode_res = tokenizer.decode(res, clean_up_tokenization_spaces=True, skip_special_tokens=True)
-         poem_dict[poem(letter)] = res
-
-     return poem_dict
-
- def alpha_poem(input_letter):
-
-     # Dooeum beopchik (initial sound law) substitution table
-     dooeum = {"라":"나", "락":"낙", "란":"난", "랄":"날", "람":"남", "랍":"납", "랑":"낭",
-               "래":"내", "랭":"냉", "냑":"약", "략":"약", "냥":"양", "량":"양", "녀":"여",
-               "려":"여", "녁":"역", "력":"역", "년":"연", "련":"연", "녈":"열", "렬":"열",
-               "념":"염", "렴":"염", "렵":"엽", "녕":"영", "령":"영", "녜":"예", "례":"예",
-               "로":"노", "록":"녹", "론":"논", "롱":"농", "뢰":"뇌", "뇨":"요", "료":"요",
-               "룡":"용", "루":"누", "뉴":"유", "류":"유", "뉵":"육", "륙":"육", "륜":"윤",
-               "률":"율", "륭":"융", "륵":"늑", "름":"늠", "릉":"능", "니":"이", "리":"이",
-               "린":"인", "림":"임", "립":"입"}
-     # List to collect the generated lines
-     res_l = []
-
-     # Take the input one letter at a time, together with its index
-     for idx, val in enumerate(input_letter):
-         # Apply the initial sound law
-         if val in dooeum:
-             val = dooeum[val]
-
-
-         while True:
-             # idx == 0 means this is the first letter
-             if idx == 0:
-                 # Encode the first letter
-                 input_ids = tokenizer.encode(
-                     val, add_special_tokens=False, return_tensors="pt")
-                 # print(f"Encoding {idx}: {input_ids}\n")  # 2-D tensor
-
-                 # Generate a sentence from the first letter's encoding
-                 output_sequence = model.generate(
-                     input_ids=input_ids,
-                     do_sample=True,
-                     max_length=42,
-                     min_length=5,
-                     temperature=0.9,
-                     repetition_penalty=1.7,
-                     no_repeat_ngram_size=2)[0]
-                 # print("Generate result from the first letter:", output_sequence, "\n")  # tensor
-
-             # Otherwise (not the first letter)
-             else:
-                 # One syllable
-                 input_ids = tokenizer.encode(
-                     val, add_special_tokens=False, return_tensors="pt")
-                 # print(f"Letter {idx} encoding: {input_ids} \n")
-
-                 # For a smoother acrostic, concatenate the previous encoding with the current one
-                 link_with_pre_sentence = torch.cat((generated_sequence, input_ids[0]), 0)
-                 link_with_pre_sentence = torch.reshape(link_with_pre_sentence, (1, len(link_with_pre_sentence)))
-                 # print(f"Tensor linked with the previous one: {link_with_pre_sentence} \n")
-
-                 # Generate a sentence from the encoded input
-                 output_sequence = model.generate(
-                     input_ids=link_with_pre_sentence,
-                     do_sample=True,
-                     max_length=42,
-                     min_length=5,
-                     temperature=0.9,
-                     repetition_penalty=1.7,
-                     no_repeat_ngram_size=2)[0]
-                 # print(f"Generate after encoding {idx}: {output_sequence}")
-
-             # Convert the output to a list (still token IDs, padding may follow the text)
-             generated_sequence = output_sequence.tolist()
-             # print(f"Encoded list {idx}: {generated_sequence} \n")
-
-             # Strip padding by slicing up to the pad index; padding may be absent, so check first
-             if tokenizer.pad_token_id in generated_sequence:
-                 generated_sequence = generated_sequence[:generated_sequence.index(tokenizer.pad_token_id)]
-
-             generated_sequence = torch.tensor(generated_sequence)
-             # print(f"Encoded list {idx} back to a tensor after padding removal: {generated_sequence} \n")
-
-             # Past the first letter, drop the earlier lines' encoding so only the
-             # newly generated line goes into the result list
-             # print(generated_sequence)
-             if idx != 0:
-                 # Slice past the previous text's length to drop the earlier lines
-                 generated_sequence = generated_sequence[len_sequence:]
-
-             len_sequence = len(generated_sequence)
-             # print("len_seq", len_sequence)
-
-             # If the model only echoed the syllable back, try again; otherwise leave the loop
-             if len_sequence > 1:
-                 break
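-             # len_sequence <= 1 means nothing beyond the seed syllable survived
-             # the slicing above, so the while loop resamples until a real line appears.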
-
-         # Append it to the result list
-         res_l.append(generated_sequence)
-
-     poem_dict = {"Type": "alpha"}
-
-     for letter, res in zip(input_letter, res_l):
-         decode_res = tokenizer.decode(res, clean_up_tokenization_spaces=True, skip_special_tokens=True)
-         poem_dict[poem(letter)] = decode_res
-
-     return poem_dict
-
- # Image (.gif)
- @st.cache(show_spinner=False)
- def load_lottieurl(url: str):
-     r = requests.get(url)
-     if r.status_code != 200:
-         return None
-     return r.json()
-
- lottie_url = "https://assets7.lottiefiles.com/private_files/lf30_fjln45y5.json"
-
- lottie_json = load_lottieurl(lottie_url)
- st_lottie(lottie_json, speed=1, height=200, key="initial")
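- # load_lottieurl returns None on a failed request instead of raising; key="initial"
- # gives the animation widget a stable identity across Streamlit reruns.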
-
-
- # Title
- row0_spacer1, row0_1, row0_spacer2, row0_2, row0_spacer3 = st.columns(
-     (0.01, 2, 0.05, 0.5, 0.01)
- )
-
- with row0_1:
-     st.markdown("# 한글 노래 가사 n행시✍")
-     st.markdown("### 🦁멋쟁이사자처럼 AIS7🦁 - 파이널 프로젝트")
-
- with row0_2:
-     st.write("")
-     st.write("")
-     st.write("")
-     st.subheader("1조 - 해파리")
-     st.write("이지혜, 최지영, 권소희, 문종현, 구자현, 김의준")
-
- st.write('---')
-
- # Explanation
- row1_spacer1, row1_1, row1_spacer2 = st.columns((0.01, 0.01, 0.01))
-
- with row1_1:
-     st.markdown("### n행시 가이드라인")
-     st.markdown("1. 하단에 있는 텍스트바에 5자 이하 단어를 넣어주세요")
-     st.markdown("2. 'n행시 제작하기' 버튼을 클릭해주세요")
-
- st.write('---')
-
- # Model & Input
- row2_spacer1, row2_1, row2_spacer2 = st.columns((0.01, 0.01, 0.01))
-
- col1, col2 = st.columns(2)
-
- # Word Input
- with row2_1:
-
-     with col1:
-         genre = st.radio(
-             "n행시 타입 선택",
-             ('Alpha', 'Beta(test중)'))
-
-         if genre == 'Alpha':
-             n_line_poem = alpha_poem
-
-         else:
-             n_line_poem = beta_poem
-
-     with col2:
-         word_input = st.text_input(
-             "n행시에 사용할 단어를 적고 버튼을 눌러주세요.(최대 5자) 👇",
-             placeholder='한글 단어를 입력해주세요',
-             max_chars=5
-         )
-         word_input = re.sub("[^가-힣]", "", word_input)
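-         # The regex keeps only complete Hangul syllables (가-힣); digits, Latin
-         # letters, and bare jamo are stripped, so such input becomes "" and
-         # triggers the error message below.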
-
- if st.button('n행시 제작하기'):
-     if word_input == "":
-         st.error("온전한 한글 단어를 사용해주세요!")
-
-     else:
-         st.write("n행시 단어 : ", word_input)
-         with st.spinner('잠시 기다려주세요...'):
-             result = n_line_poem(word_input)
-         st.success('완료됐습니다!')
-         for r in result:
-             st.write(f'{r} : {result[r]}')
-