KSSDS: Korean Sentence Splitter for Dialogue Systems
KSSDS is a model based on the encoder of lcw99/t5-base-korean-text-summary and fine-tuned on AI Hub data.
It is a sentence splitter for Korean dialogue systems, built to segment Korean text produced by STT models such as Whisper into individual sentences.
For a detailed description, please see the KSSDS GitHub repository.
How to Use
1. Install from PyPI or GitHub (recommended)
- Installing from PyPI or GitHub is the most convenient way to use KSSDS (a usage sketch follows below).
- See the KSSDS GitHub repository for installation and usage instructions.
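For illustration only, here is a minimal sketch of how the installed package might be called. The import path, the KSSDS class, its model_path argument, and the split_sentences method are assumptions made for this sketch, not the documented API; refer to the GitHub repository for the actual interface.

# Hypothetical usage of the installed package (names below are assumptions, not the documented API)
from KSSDS import KSSDS  # assumed import path

kssds = KSSDS(model_path="ggomarobot/KSSDS")  # assumed constructor argument
sentences = kssds.split_sentences("안녕하세요. 오늘 날씨가 참 좋네요.")  # assumed method name

for idx, sentence in enumerate(sentences):
    print(f"{idx + 1}: {sentence}")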
2. Native Hugging Face Hub usage
You can download the model and the custom T5 encoder directly from the Hugging Face Hub.
Below is example code for the full sentence-splitting pipeline.
# Import the required libraries and modules
from huggingface_hub import hf_hub_download  # download files from the Hugging Face Hub
import sys
import os
from transformers import AutoTokenizer  # Hugging Face tokenizer
import torch
from typing import List

# Download T5_encoder.py and the model from the Hugging Face Hub
model_name = "ggomarobot/KSSDS"
file_path = hf_hub_download(repo_id=model_name, filename="T5_encoder.py")  # download the T5_encoder.py file
module_dir = os.path.dirname(file_path)  # directory that contains T5_encoder.py
sys.path.append(module_dir)  # add that directory to the Python path
from T5_encoder import T5ForTokenClassification  # custom T5 encoder model

tokenizer = AutoTokenizer.from_pretrained(model_name)  # load the tokenizer
model = T5ForTokenClassification.from_pretrained(model_name)  # load the custom T5 encoder model

# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the selected device
model = model.to(device)
model.eval()  # set the model to evaluation mode

# Maximum input length per chunk
max_length = 512
def preprocess_text(input_text: str) -> List[dict]:
    """
    Split a long input text into chunks the model can process.

    Args:
        input_text (str): Long input text to process.

    Returns:
        List[dict]: List of chunks, each containing input_ids and an attention_mask.
    """
    tokenized_sequence = tokenizer.encode(input_text, add_special_tokens=False)
    chunks = [tokenized_sequence[i:i + max_length] for i in range(0, len(tokenized_sequence), max_length)]
    processed_data = [{"input_ids": chunk, "attention_mask": [1] * len(chunk)} for chunk in chunks]
    return processed_data
def handle_repetitions(sentences: List[str], max_repeats: int = 60, detection_threshold: int = 70, max_phrase_length: int = 2) -> List[str]:
    """
    Handles single-word and phrase repetitions in a list of sentences, ensuring proper order and separation.

    Args:
        sentences (List[str]): List of input sentences to process.
        max_repeats (int): Maximum number of phrase repetitions to allow before splitting.
        detection_threshold (int): Minimum length of text for repetition detection.
        max_phrase_length (int): Maximum length of a phrase to consider for repetition detection.

    Returns:
        List[str]: The processed text split into sentences.
    """
    processed_sentences = []

    for sentence in sentences:
        words = sentence.split()
        if len(words) <= detection_threshold:
            processed_sentences.append(sentence)
            continue

        result_sentences = []
        current_sentence = []
        current_repetition = []

        def flush_sentence():
            """Flush the current sentence into result_sentences."""
            if current_sentence:
                result_sentences.append(" ".join(current_sentence))
                current_sentence.clear()

        def flush_repetition(phrase_length):
            """Flush the current repetition into result_sentences."""
            for i in range(0, len(current_repetition), phrase_length * max_repeats):
                chunk = current_repetition[i:i + phrase_length * max_repeats]
                result_sentences.append(" ".join(chunk))
            current_repetition.clear()

        def find_repeating_phrase(start_idx):
            """Find the smallest repeating phrase starting at the given index."""
            for phrase_length in range(1, max_phrase_length + 1):
                phrase = words[start_idx:start_idx + phrase_length]
                next_idx = start_idx + phrase_length
                if next_idx + phrase_length <= len(words) and words[next_idx:next_idx + phrase_length] == phrase:
                    return phrase
            return None

        i = 0
        while i < len(words):
            repeating_phrase = find_repeating_phrase(i)
            if repeating_phrase:
                # Flush any ongoing sentence before handling repetition
                flush_sentence()

                # Accumulate repeating phrases
                phrase_length = len(repeating_phrase)
                while i + phrase_length <= len(words) and words[i:i + phrase_length] == repeating_phrase:
                    current_repetition.extend(repeating_phrase)
                    i += phrase_length

                # Flush accumulated repetition if it reaches the threshold
                if len(current_repetition) >= phrase_length * max_repeats:
                    flush_repetition(phrase_length)
            else:
                # Add non-repeating words to the current sentence
                if current_repetition:
                    # Flush repetition before starting a new sentence
                    flush_repetition(1)  # Default to single-word repetition
                current_sentence.append(words[i])
                i += 1

        # Flush any remaining tokens
        flush_sentence()
        flush_repetition(1)  # Default to single-word repetition for the last chunk

        processed_sentences.extend(result_sentences)

    return processed_sentences
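
# --- Illustrative example (not part of the original card) -------------------
# A synthetic Whisper-style repetition, showing what handle_repetitions does:
# because the string below has more than detection_threshold (70) words, the run
# of repeated words is split into chunks of at most max_repeats (60) words each,
# while the trailing normal sentence is kept as its own string.
demo_sentences = ["네 " * 200 + "그래서 오늘 회의를 시작하겠습니다"]
for demo_sentence in handle_repetitions(demo_sentences):
    print(demo_sentence)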
def segment_predictions(input_ids: List[int], predictions: List[int]) -> List[List[int]]:
    """
    Segment predictions into sentences based on label 1 (sentence-ending).

    Args:
        input_ids (List[int]): List of input token IDs.
        predictions (List[int]): Corresponding prediction labels.

    Returns:
        List[List[int]]: Segmented sentences as lists of token IDs.
    """
    segments = []
    current_segment = []
    for token, label in zip(input_ids, predictions):
        if label == 1:  # Sentence-ending label
            if current_segment:
                current_segment.append(token)
                segments.append(current_segment)
                current_segment = []
            else:
                segments.append([token])
        else:
            current_segment.append(token)
    if current_segment:  # Append any remaining tokens
        segments.append(current_segment)
    return segments
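
# --- Illustrative example (not part of the original card) -------------------
# Toy token IDs and labels showing how segment_predictions cuts a sequence at
# every position labeled 1 (sentence-ending):
print(segment_predictions([10, 11, 12, 13, 14, 15], [0, 0, 1, 0, 0, 1]))
# Expected output: [[10, 11, 12], [13, 14, 15]]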
def decode_predictions(input_ids: List[int], predictions: List[int], tokenizer, carry_over=None):
    """
    Decode model predictions into sentences, handling carry-over tokens across chunks.

    Args:
        input_ids (List[int]): Input token IDs.
        predictions (List[int]): Prediction labels.
        tokenizer: Hugging Face tokenizer instance.
        carry_over (List[int], optional): Tokens carried over from the previous chunk.

    Returns:
        Tuple[List[str], List[int]]: Decoded sentences and remaining carry-over tokens.
    """
    if carry_over is None:
        carry_over = []

    sentences = []
    tokens = carry_over + input_ids  # Include carry-over tokens
    labels = [0] * len(carry_over) + predictions  # Carry-over tokens have label 0

    segmented = segment_predictions(tokens, labels)
    for segment in segmented[:-1]:  # Decode all segments except the last one
        sentence = tokenizer.decode(segment, skip_special_tokens=False, clean_up_tokenization_spaces=False).strip()
        if sentence:
            sentences.append(sentence)

    # Handle carry-over for the last segment
    carry_over = segmented[-1] if segmented[-1] and labels[len(tokens) - len(segmented[-1])] != 1 else []
    return sentences, carry_over
def inference(input_text: str) -> List[str]:
    """
    Perform sentence splitting on input text using the HF Hub model.

    Args:
        input_text (str): Input text to split into sentences.

    Returns:
        List[str]: List of split sentences.
    """
    chunks = preprocess_text(input_text)
    carry_over_tokens = []
    sentences = []

    for chunk in chunks:
        # Prepare inputs for the model
        input_ids_tensor = torch.tensor([chunk["input_ids"]], dtype=torch.long, device=device)
        attention_mask_tensor = torch.tensor([chunk["attention_mask"]], dtype=torch.long, device=device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids_tensor, attention_mask=attention_mask_tensor)
        predictions = outputs.logits.argmax(dim=-1).squeeze().tolist()

        # Ensure predictions is a list
        if isinstance(predictions, int):
            predictions = [predictions]

        input_ids = chunk["input_ids"]
        decoded_sentences, carry_over_tokens = decode_predictions(input_ids, predictions, tokenizer, carry_over_tokens)
        sentences.extend(decoded_sentences)

    # Process any remaining carry-over tokens
    if carry_over_tokens:
        remaining_sentence = tokenizer.decode(carry_over_tokens, skip_special_tokens=False, clean_up_tokenization_spaces=False).strip()
        if remaining_sentence:
            sentences.append(remaining_sentence)

    return handle_repetitions(sentences)
# Usage example
input_text = "안녕하세요. 오늘 날씨가 참 좋네요. 저는 산책을 나갈 예정입니다."  # "Hello. The weather is really nice today. I am planning to go for a walk."
split_sentences = inference(input_text)

# Print the results
for idx, sentence in enumerate(split_sentences):
    print(f"{idx + 1}: {sentence}")
About the KSSDS_NO_LF Model
KSSDS_NO_LF is a model trained without the Length Filter, for an ablation study.
When processing long texts, it performs comparatively less precise sentence splitting.
It is used in the same way as the main model, but it may not be suitable for practical use outside of research and comparison purposes.
How to Use
Replace model_name in the Hugging Face Hub native usage snippet above with "ggomarobot/KSSDS_NO_LF":
# Load tokenizer and model
model_name = "ggomarobot/KSSDS_NO_LF"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForTokenClassification.from_pretrained(model_name)