Spaces:
Runtime error
Upload 2 files
- app.py +174 -0
- requirements.txt +3 -0
app.py
ADDED
@@ -0,0 +1,174 @@
+# %%
+import gradio as gr
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+from nltk.tokenize import sent_tokenize
+from konlpy.tag import Kkma
+import gc
+
+import nltk
+nltk.download('punkt')
+
+# from PyKakao import KoGPT
+# kogpt_api = KoGPT(service_key = "")
+import openai
+openai.api_key = 'sk-REDACTED'  # redacted: supply your own OpenAI API key (preferably via an environment variable) instead of committing it
+gpt2_tokenizer = AutoTokenizer.from_pretrained('gpt2')
+
+import os
+# if not(os.environ['JAVA_HOME']):
+#     os.environ['JAVA_HOME'] = r'C:\Program Files\Java\jdk-17\bin\server'
+
+#en2ko = 'alphahg/m2m100_418M-finetuned-en-to-ko-4770260'#'alphahg/mbart-large-50-finetuned-en-to-ko-8603428-finetuned-en-to-ko-9914408'
+en2ko = 'alphahg/mbart-large-50-finetuned-en-to-ko-8603428-finetuned-en-to-ko-9914408'
+ko2en = 'alphahg/opus-mt-ko-en-finetuned-ko-to-en-2780616'
+ensum = 'allenai/led-large-16384-arxiv'
+kosum = 'alphahg/pko-t5-small-finetuned-paper-4564652' #'lcw99/t5-base-korean-text-summary'
+
+kkma = Kkma()
+#en_pipe = pipeline('translation', model=en2ko, tokenizer=en2ko, src_lang = "en", tgt_lang = "ko", device_map="auto")
+en2ko_model = AutoModelForSeq2SeqLM.from_pretrained(en2ko)
+
+# Translation pipelines are pinned to cuda:0 and summarization pipelines to cuda:1 (two GPUs assumed).
+en_pipe = pipeline('translation', model=en2ko_model, tokenizer=en2ko, src_lang = "en_XX", tgt_lang = "ko_KR", device="cuda:0")
+ko_pipe = pipeline('translation', model=ko2en, tokenizer=ko2en, device="cuda:0")
+style_pipe = pipeline('translation', model=en2ko_model, tokenizer=en2ko, src_lang = "ko_KR", tgt_lang = "ko_KR", device="cuda:0")
+
+en_sum = pipeline('summarization', model=ensum, tokenizer=ensum, device="cuda:1")
+ko_sum = pipeline('summarization', model=kosum, tokenizer=kosum, device="cuda:1")
+
+def len_tokens(text, pipe):
+    return len(pipe.tokenizer(text)['input_ids'])
+
+def split_sent(sentences, pipe, max_len=256):
+    if not sentences:
+        return []
+
+    # Greedily merge consecutive sentences; start a new chunk once adding the next sentence would exceed max_len tokens.
+    paragraphs = []
+    example = sentences[0]
+    for i in range(1, len(sentences)):
+        if len_tokens(example + ' ' + sentences[i], pipe) > max_len:
+            paragraphs.append(example)
+            example = sentences[i]
+        else:
+            example += ' ' + sentences[i]
+
+    paragraphs.append(example)
+
+    return paragraphs
+
+# chatbot = Chatbot({
+#     #"session_token": "<redacted session token>"
+
+# }, conversation_id=None, parent_id=None) # You can start a custom conversation
+# %%
+def translate(text, lang, gpt_fix=False):
+    from_en = False if lang == '한영' else True  # '한영' = Korean-to-English, '영한' = English-to-Korean
+    sentences = sent_tokenize(text) if from_en else kkma.sentences(text)
+    #print(sentences)
+    if not sentences:
+        return ''
+
+    paragraphs = split_sent(sentences, en_pipe, max_len=180) if from_en else split_sent(sentences, ko_pipe)
+    #print(paragraphs)
+
+    ret = []
+    for text in paragraphs:
+        result = en_pipe(text) if from_en else ko_pipe(text)
+        ret.append(result[0]['translation_text'])
+
+    translated = ' '.join(ret)
+    gc.collect()
+
+    if gpt_fix:
+        if lang == '한영':
+            prompt = 'Improve given formal article without adding:'
+        elif lang == '영한':
+            prompt = "추가적인 내용없이 주어진 글을 개선해:"  # "Improve the given text without adding content:"
+
+        def fix_sent(sent):
+            number_of_tokens = len(gpt2_tokenizer(sent)['input_ids'])
+
+            response = openai.Completion.create(
+                model="text-davinci-003",
+                prompt=prompt+'\n'+sent,
+                temperature=0,
+                max_tokens=number_of_tokens+128,
+                top_p=1.0,
+                frequency_penalty=0.0,
+                presence_penalty=0.0
+            )
+
+            return response.choices[0].text.strip()
+
+        # def fix_sent(sent):
+        #     generated = kogpt_api.generate(prompt+'\n'+sent, max_tokens=256)
+        #     return generated['generations'][0]['text']
+
+        translated = fix_sent(translated)
+
+    return translated
+
+#%%
+def translate_with_sum(text, lang, gpt_fix=False):
+    from_en = False if lang == '한영' else True
+
+    if lang == '영한':
+        summary = en_sum(text, max_length=int(len_tokens(text, en_sum)/2)+32)
+        text = summary[0]['summary_text']
+
+    sentences = sent_tokenize(text) if from_en else kkma.sentences(text)
+    #print(sentences)
+    if not sentences:
+        return ''
+
+    paragraphs = split_sent(sentences, en_pipe if from_en else ko_pipe)
+    #print(paragraphs)
+
+    ret = []
+    for text in paragraphs:
+        result = en_pipe(text) if from_en else ko_pipe(text)
+        ret.append(result[0]['translation_text'])
+
+    summarized = ' '.join(ret)
+    if lang == '한영':
+        summary = en_sum(summarized, max_length=int(len_tokens(summarized, en_sum)/2)+32)
+        return summary[0]['summary_text']
+
+    gc.collect()
+    return summarized
+
+def summarize(text, lang):
+    if lang == 'Korean':
+        summarizer = ko_sum
+    elif lang == 'English':
+        summarizer = en_sum
+
+    summary = summarizer(text, max_length=int(len_tokens(text, summarizer) * 0.7))[0]['summary_text']
+    return summary
+
+def translate_styleonly(text):
+    # Korean-to-Korean pass through the mBART model (ko_KR to ko_KR) so only the writing style is rewritten.
+    sentences = kkma.sentences(text)
+    paragraphs = split_sent(sentences, style_pipe, max_len=180)
+    #print(paragraphs)
+
+    ret = []
+    for text in paragraphs:
+        result = style_pipe(text)
+        ret.append(result[0]['translation_text'])
+
+    translated = ' '.join(ret)
+    gc.collect()
+
+    return translated
+
+# %%
+# Radio labels: '영한' = English-to-Korean, '한영' = Korean-to-English.
+# Note: with batch=True, Gradio passes lists to fn; these functions currently handle single strings.
+interface1 = gr.Interface(fn=translate, inputs=["text", gr.Radio(["영한", "한영"], value='영한'), 'checkbox'], outputs="text", batch=True, max_batch_size=8)
+interface2 = gr.Interface(fn=translate_with_sum, inputs=["text", gr.Radio(["영한", "한영"], value='영한')], outputs="text", batch=True, max_batch_size=8)
+parallel_interface = gr.Parallel(interface1, interface2)
+
+summarize_interface = gr.Interface(fn=summarize, inputs=["text", gr.Radio(["Korean", "English"], value='Korean')], outputs="text", batch=True, max_batch_size=8)
+style_interface = gr.Interface(fn=translate_styleonly, inputs=["text"], outputs="text", batch=True, max_batch_size=8)
+#%%
+# Tab labels: '번역 및 요약' (translate & summarize), '요약' (summarize), '스타일 번역' (style-only translation).
+demo = gr.TabbedInterface([parallel_interface, summarize_interface, style_interface], ['번역 및 요약', '요약', '스타일 번역'], css="footer {visibility: hidden}") # '요약'
+demo.queue()
+demo.launch(share=True) # Share the demo
+# %%
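
For reference, the greedy sentence-packing in split_sent can be exercised without downloading any model. The sketch below is not part of the commit; DummyPipe and its whitespace tokenizer are stand-ins for a real pipeline's tokenizer, used only to show how sentences are merged until the max_len budget is hit.

# Standalone sketch of split_sent's chunking (assumption: a dummy whitespace tokenizer replaces pipe.tokenizer).
class DummyTokenizer:
    def __call__(self, text):
        return {'input_ids': text.split()}  # one "token" per whitespace-separated word

class DummyPipe:
    tokenizer = DummyTokenizer()

def len_tokens(text, pipe):
    return len(pipe.tokenizer(text)['input_ids'])

def split_sent(sentences, pipe, max_len=256):
    if not sentences:
        return []
    paragraphs = []
    example = sentences[0]
    for i in range(1, len(sentences)):
        if len_tokens(example + ' ' + sentences[i], pipe) > max_len:
            paragraphs.append(example)
            example = sentences[i]
        else:
            example += ' ' + sentences[i]
    paragraphs.append(example)
    return paragraphs

sents = ["This is one sentence.", "Here is another.", "And a third, somewhat longer sentence."]
print(split_sent(sents, DummyPipe(), max_len=8))
# ['This is one sentence. Here is another.', 'And a third, somewhat longer sentence.']
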
requirements.txt
ADDED
@@ -0,0 +1,3 @@
+nltk
+konlpy
+openai
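
Note on dependencies: app.py also imports gradio and transformers, and the mBART/Marian/T5 pipelines additionally need torch and sentencepiece at runtime, none of which are listed above (gradio is typically provided by the Space's SDK). If the runtime error shown for this Space comes from missing packages, a fuller requirements.txt might look like the sketch below; the four added entries are assumptions, not part of this commit.

nltk
konlpy
openai
transformers
torch
sentencepiece
gradio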