Spaces:

yhshin
/

kr-article-summarizer

Runtime error

File size: 9,775 Bytes

import json
import numpy as np
import os
# Must be set before `transformers` is imported further down in this file.
# NOTE(review): presumably turns off thread parallelism inside the Hugging Face
# `tokenizers` library (and the related fork warning) — confirm against its docs.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Load all test data into list of dictionaries
#summary_data_path = 'sci-news-sum-kr-50/data/'
#summary_objects = []
#for root, dirs, files in os.walk(summary_data_path):
#    files.sort() # Sort file names
#    for ifile, file_name in enumerate(files):
#        with open(os.path.join(root, file_name)) as f:
#            s = json.load(f)
#            s['index'] = file_name.replace('.json','') # index = 'XY' for file 'XY.json' 
#            s['sentences'] = [sen + '.' for sen in s['sentences']] # Add punctuation to all sentences
#            s['body'] = ' '.join(s['sentences']) # body is all sentences concatenated with spaces in between
#            summary_objects.append(s)

# Load spacy to split text into sentences
import spacy

# Load the Korean spaCy pipeline once at import time; the resulting `nlp`
# object is shared by every call that needs sentence splitting.
# The components listed below are switched off and only 'senter' is switched
# on, since text_to_sentences() reads nothing but `doc.sents`.
# NOTE(review): 'senter' is presumably spaCy's lightweight sentence segmenter
# — confirm that it ships with ko_core_news_sm.
nlp = spacy.load("ko_core_news_sm")
nlp.select_pipes(disable=
                        ['tok2vec','tagger','morphologizer','parser','lemmatizer','attribute_ruler','ner']
                        )
nlp.enable_pipe('senter')

def text_to_sentences(nlp, text):
    """Run *text* through the *nlp* pipeline and return its sentences as a list.

    Args:
        nlp: Callable pipeline; its result must expose an iterable ``sents``.
        text: Text to segment.

    Returns:
        A list holding every item yielded by ``sents``, in order.
    """
    return list(nlp(text).sents)

from transformers import AutoConfig, AutoTokenizer, AutoModel
from summarizer import Summarizer

# Identifier handed to every `from_pretrained` call below.
model_path = 'skt/kobert-base-v1'

# Load model, model config and tokenizer via Transformers
custom_config = AutoConfig.from_pretrained(model_path)
# NOTE(review): presumably Summarizer reads the encoder's hidden states to
# build sentence embeddings, hence this flag — confirm against the library.
custom_config.output_hidden_states=True
custom_tokenizer = AutoTokenizer.from_pretrained(model_path, do_lower_case=False)
custom_model = AutoModel.from_pretrained(model_path, config=custom_config)
# Wrap the loaded encoder and tokenizer in the Summarizer that
# create_summary() receives as its `model` argument.
model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)

def create_summary(nlp, model, text, k_max=10):
    """Summarize *text* with *model* and return the summary as sentences.

    Args:
        nlp: Sentence-splitting pipeline, forwarded to ``text_to_sentences``.
        model: Summarizer object. It must provide
            ``calculate_optimal_k(text, k_max=...)`` and be callable as
            ``model(text, num_sentences=...)``.
        text: Article body to summarize.
        k_max: Upper bound passed to ``calculate_optimal_k`` when picking the
            number of summary sentences. Defaults to 10.

    Returns:
        The list produced by ``text_to_sentences`` for the generated summary,
        or an empty list when *text* is empty or contains only whitespace.
    """
    # Nothing to summarize: skip the model instead of feeding it an empty body.
    if not text or not text.strip():
        return []

    k = model.calculate_optimal_k(text, k_max=k_max)
    summary = model(text, num_sentences=k)
    return text_to_sentences(nlp, summary)

from urllib.request import urlopen
from bs4 import BeautifulSoup

def extract_naver_news(url, timeout=10):
    """Download a Naver news page and extract its title, subtitle and body.

    Args:
        url: Address of the article page. Only ``http`` and ``https`` URLs are
            accepted: the value comes straight from the web form, and
            ``urlopen`` would otherwise also honor schemes such as ``file:``.
        timeout: Seconds to wait on the network before giving up.

    Returns:
        A dict with three string entries:
        ``'title'`` (text of the ``media_end_head_headline`` element, ``''``
        when absent), ``'subtitle'`` (text of the first ``<strong>`` inside the
        body, ``''`` when absent) and ``'article'`` (body text fragments that
        end with a period, joined by single spaces).

    Raises:
        ValueError: If *url* is not an http(s) URL, or the page contains no
            element with ``id="dic_area"``.
        urllib.error.URLError: If the page cannot be fetched.
    """
    from urllib.parse import urlparse  # local import keeps the block self-contained

    if urlparse(url).scheme not in ('http', 'https'):
        raise ValueError(f'Unsupported URL (expected http/https): {url!r}')

    # Context manager closes the connection even if reading fails.
    with urlopen(url, timeout=timeout) as response:
        html = response.read()
    soup = BeautifulSoup(html, features="html.parser")

    # The headline is optional metadata; do not fail the whole request over it.
    title_tag = soup.find(class_="media_end_head_headline")
    title = title_tag.get_text() if title_tag is not None else ''

    area = soup.find(id="dic_area")
    if area is None:
        raise ValueError(f'No article body (id="dic_area") found at {url!r}')

    subtitle_tag = area.find('strong')
    subtitle = subtitle_tag.get_text('\n') if subtitle_tag is not None else ''

    # Drop image captions so they do not leak into the article text.
    for tag in area.find_all(class_="img_desc"):
        tag.extract()

    # Keep only the text fragments that end with a period (i.e. look like
    # complete sentences) and join them with single spaces.
    article = ' '.join(
        text for text in area.stripped_strings if text.endswith('.')
    )
    return {
        'title': title,
        'subtitle': subtitle,
        'article': article,
    }

import gradio as gr
def interface_handler(custom_text, naver_url, choice):
    """Gradio callback: summarize either the pasted text or a Naver article.

    Args:
        custom_text: Text typed into the custom-text box.
        naver_url: Naver news URL typed into the URL box.
        choice: Selected radio option. ``1`` means "summarize the article
            behind *naver_url*"; any other value summarizes *custom_text*.

    Returns:
        The summary as one string, each sentence followed by a blank line.
    """
    if choice == 1:
        source_text = extract_naver_news(naver_url)['article']
    else:
        source_text = custom_text

    sentences = create_summary(nlp, model, source_text)
    return ''.join(f'{sentence}\n\n' for sentence in sentences)

# Example inputs pre-filled in the web form.
default_url = "https://n.news.naver.com/article/015/0004692703?sid=102"
# Sample article; the stray `"),` left over after the final sentence was
# removed so it no longer reaches the text box or the summarizer.
default_text = """
'나선형 신경망' 학습 기술. 카메라로 찍은 이미지에서 특정한 사물 찾는 기술 활용. 숲속에서 등산로 척척 찾아 드론이 산악구조대 역할도. 미국 국방부는 지난달 말 인공지능(AI)을 이용해 인간 도움 없이 적을 식별해 타격하는 드론(무인 항공기)을 시연했다. 이 드론은 카메라 화면에서 총으로 무장한 사람과 무기가 없는 사람을 구분할 수 있다. 표적으로 정한 사람을 찾아 그가 탄 자동차를 추적하는 기능도 있다. 조만간 원격 조종 없이도 전장에서 특수부대 군인들처럼 임무를 수행하는 드론이 등장할 전망이다. 이 드론이 사람 도움 없이 카메라 영상에서 목표물을 인식하고 추적할 수 있는 것은 바로 ‘머신러닝’ 덕분이다. 머신러닝은 AI의 한 분야로 컴퓨터가 인간처럼 스스로 학습할 수 있는 능력을 부여하는 작업을 말한다. 머신러닝의 원리는 인간을 포함한 영장류 두뇌의 정보 처리 구조인 ‘신경망’을 모사하는 방식이다. 바둑 대결에서 이세돌 9단을 이긴 구글의 ‘알파고’ 등 지금까지 소개된 AI 대부분은 심층신경망을 기반으로 한 머신러닝 알고리즘을 이용한다. 이미지에서 특정 사물을 찾는 기술은 인간이 아니라 고양이 뇌에서 유래했다. 고양이 뇌의 시신경에서 발견되는 ‘나선형 신경망’ 구조는 시각세포들이 보내오는 반응을 모아 여러 개의 층(層)으로 나눈다. 이를 3단계에 걸쳐 점차적으로 단순화하면서 물체의 색깔이나 모양을 파악한다. 이를 처음으로 연구한 데이비드 휴벨과 토어스텐 비젤은 1981년 노벨 생리의학상을 받았다. AI 과학자들은 나선형 신경망에서 아이디어를 얻어 이미지에서 사물을 판별하는 알고리즘을 설계했다. 우선 이미지에서 큰 특징을 추출한 다음 점차 작고 복잡한 특징을 발견해 나가는 방식이다. 예컨대 사진 속에 자동차가 있다고 해 보자. 알고리즘은 우선 사물의 전체적인 윤곽을 먼저 확인한 뒤 기존에 입력된 사진 데이터와 비교해 ‘탈 것’으로 범위를 좁힌다. 이후 타이어나 제조사 엠블럼처럼 세부적인 특징을 파악하고 ‘사진 속에 있는 물체는 자동차’라는 결론을 내리게 된다. 제프 딘 구글 수석연구원은 “나선형 신경망은 다른 머신러닝 구조들과 비교할 때 영상, 음성 분야에서 좋은 성능을 보인다”며 “이를 이용하면 컴퓨터가 처음 본 사물도 무엇인지 파악할 수 있다”고 설명했다. 주변에서 볼 수 있는 영상촬영용 드론에도 이보다는 간단하지만 비슷한 기술이 이용된다. 세계 1위 드론업체인 중국 DJI의 ‘팬텀4’는 사람 눈처럼 두 개의 카메라 센서를 장착했다. 이를 통해 대상 물체를 확인하고 일정 거리를 유지하면서 따라다닌다. 이른바 ‘액티브 트랙’ 기능이다. 액티브 트랙 기능을 켜면 이용자가 지정한 사물이나 사람의 윤곽선을 인식하고 픽셀(이미지를 구성하는 가장 작은 단위인 네모 모양의 점) 단위로 인식한다. 그 픽셀을 계속적으로 같은 크기로 유지하기 위해 기체가 이동한다. 예컨대 주변에 있는 사람을 지정했을 때 픽셀 크기가 상하좌우 100×100 픽셀이었다고 해 보자. 그 사람이 앞으로 움직여서 80×80 픽셀 크기로 줄어들면 원래 수치인 100×100 픽셀을 되찾기 위해 드론도 따라서 앞으로 움직이는 방식이다. 과학자들은 나선형 신경망을 본뜬 머신러닝 기술을 응용해 인간 삶을 윤택하게 할 수 있는 기술을 개발하고 있다. 스위스 취리히대 연구팀은 드론을 이용해 알프스 산맥에서 조난자를 찾는 기술을 연구 중이다. 연구팀이 개발한 AI 드론은 카메라가 촬영한 이미지를 이용해 숲이 우거진 곳과 등산로를 구분한다. 이를 드론의 비행 제어기로 전달해 이동 방향을 결정한다. 올해 초 취리히대가 완료한 첫 실험에서는 ‘드론이 인간보다 등산로를 잘 찾는다’는 결과가 나왔다. 연구팀은 약 2만장의 알프스 산 등산로 사진을 바탕으로 3일간 드론에 탑재된 인공지능의 심층신경망을 학습시켰다. 이후 드론이 전혀 가보지 못한 등산로를 오르도록 했다. 실험 결과 사람 눈으로 새로운 등산로를 식별할 확률은 82%였으나 AI 드론은 85%의 성공률을 보여줬다. 
취리히대 연구팀은 “AI 드론은 조만간 실전에 투입돼 산악구조대가 조난자를 찾는 일을 도울 수 있을 것”이라고 말했다. 신경망 학습 기술은 다양한 용도로 활용할 수 있다. 문태현 DJI코리아 대표는 “AI를 탑재한 드론은 송전선이나 송유관 등 산업시설물의 결함 발견, 산불 감지, 장애물이나 군사용 목표물 탐지 등 이용 가능 범위가 무궁무진하다”고 말했다.
"""

# Static texts shown around the Gradio form.
title = "AI 문서 요약\nKorean text summarization"
# Read the markdown with an explicit encoding so decoding does not depend on
# the host locale. NOTE(review): assumes both files are stored as UTF-8.
with open('description.md', mode='r', encoding='utf-8') as file:
    description = file.read()
with open('article.md', mode='r', encoding='utf-8') as file:
    article = file.read()


# Web UI: two text boxes (custom text, Naver URL) plus a radio button that
# chooses which of the two gets summarized; the summary lands in one text box.
# Components are created through the top-level `gr.Textbox` / `gr.Radio`
# classes with `value=` for the initial content; the legacy `gr.inputs` /
# `gr.outputs` namespaces (and their `default=` / `optional=` keywords) are no
# longer available in current Gradio releases. Requires Gradio >= 3.0.
demo = gr.Interface(
    fn=interface_handler,
    inputs=[
        gr.Textbox(lines=5, value=default_text, label="임의 문서 (Custom text)"),
        gr.Textbox(lines=1, value=default_url, label="네이버 뉴스 기사 링크주소 (Naver News article URL)"),
        # type="index" makes the handler receive 0 or 1 instead of the label;
        # nothing is pre-selected.
        gr.Radio(["입력 문서 요약", "네이버 뉴스 기사 요약"], type="index", value=None, label="옵션"),
    ],
    outputs=[
        gr.Textbox(label="개요"),
    ],
    title=title,
    description=description,
    article=article,
)

# Start the Gradio app only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    demo.launch(debug=True)