# app.py -- last commit d54697e ("Update app.py") by lcw99.
import streamlit as st
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import nltk
# Sentence-tokenizer data needed by the nltk.sent_tokenize call further down.
# Older NLTK releases load the 'punkt' resource; NLTK 3.8.2+ looks for
# 'punkt_tab' instead, so both are requested.  A release that does not know
# one of the names is expected to report the failure and carry on rather than
# raise (NOTE(review): confirm against the pinned NLTK version).
nltk.download('punkt')
nltk.download('punkt_tab')

# Checkpoint identifier handed to both from_pretrained() calls below.
model_dir = "lcw99/t5-base-korean-text-summary"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

# Passed to the tokenizer as max_length together with truncation=True, so
# longer articles are cut down to this many tokens before generation.
max_input_length = 512
# Default article pre-filled into the text area.  It must be real Hangul
# (file saved as UTF-8): the summarization checkpoint is a Korean model, and
# mis-decoded bytes would be fed to it as meaningless input.
sample_text = """
주인공 강인구(하정우)는 ‘수리남에서 홍어가 많이 나는데 다 갖다버린다’는 친구
박응수(현봉식)의 얘기를 듣고 수리남산 홍어를 한국에 수출하기 위해 수리남으로 간다.
국립수산과학원 측은 “실제로 남대서양에 홍어가 많이 살고 아르헨티나를 비롯한 남미 국가에서 홍어가 많이 잡힌다”며
“수리남 연안에도 홍어가 많이 서식할 것”이라고 설명했다.
그러나 관세청에 따르면 한국에 수리남산 홍어가 수입된 적은 없다.
일각에선 “돈을 벌기 위해 수리남산 홍어를 구하러 간 설정은 개연성이 떨어진다”는 지적도 한다.
드라마 배경이 된 2008~2010년에는 이미 국내에 아르헨티나, 칠레, 미국 등 아메리카산 홍어가 수입되고 있었기 때문이다.
실제 조봉행 체포 작전에 협조했던 ‘협력자 K씨’도 홍어 사업이 아니라 수리남에 선박용 특수용접봉을 파는 사업을 하러 수리남에 갔었다.
"""
# ---- Page layout ----------------------------------------------------------
st.title("Summarize Text")
sentence = st.text_area('Please paste your Korean article :', value=sample_text, height=500)
button = st.button("Summarize")

# ---- Generation controls (sidebar) ----------------------------------------
# The two sliders feed generate()'s min_length / max_length.  Their ranges
# overlap (min goes up to 450, max starts at 50), so the user can pick a
# minimum above the maximum; that case is handled before generating.
min_len = st.sidebar.slider('Select output min', 10, 450, step=10, value=50)
max_len = st.sidebar.slider('Select output max', 50, 500, step=10, value=150)
do_sample = st.sidebar.checkbox("Do sample", value=True)

# ---- Summarization --------------------------------------------------------
# Only run when the button was pressed and the text area holds something
# other than whitespace.
if button and sentence.strip():
    if min_len > max_len:
        # An inverted range is contradictory for generate(); fall back to a
        # fixed-length request and tell the user what happened.
        st.warning("Output min is larger than output max; using the max value for both.")
        min_len = max_len

    # Show the spinner only around the slow tokenization/generation work.
    with st.spinner("Generating Summary.."):
        inputs = tokenizer(
            ["summarize: " + sentence],
            max_length=max_input_length,
            truncation=True,
            return_tensors="pt",
        )
        output = model.generate(
            **inputs,
            num_beams=8,
            do_sample=do_sample,
            min_length=min_len,
            max_length=max_len,
        )
        decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]

    # Display only the first sentence of the decoded text, as before, but do
    # not index into an empty list when nothing was decoded.
    sentences = nltk.sent_tokenize(decoded_output.strip())
    if sentences:
        predicted_title = sentences[0]
        st.write(predicted_title)
    else:
        st.warning("The model returned an empty summary.")