wiizm committed on
Commit
730c79f
ยท
verified ยท
1 Parent(s): 026d511

Upload app\utils\text_utils.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app//utils//text_utils.py +197 -0
app//utils//text_utils.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ํ…์ŠคํŠธ ์ฒ˜๋ฆฌ ์œ ํ‹ธ๋ฆฌํ‹ฐ ํ•จ์ˆ˜
3
+ """
4
+
5
+ import re
6
+ from typing import List, Optional
7
+
8
+ from app.core.logger import get_logger
9
+
10
+ logger = get_logger(__name__)
11
+
12
+
13
def clean_text(text: str) -> str:
    """
    Normalize whitespace in a piece of text.

    Collapses every run of whitespace (spaces, tabs, newlines) into a
    single space and trims leading/trailing whitespace.

    Args:
        text: Text to normalize; falsy input yields an empty string.

    Returns:
        The whitespace-normalized text.
    """
    if not text:
        return ''
    # Collapse whitespace runs to single spaces, then trim the ends.
    return re.sub(r'\s+', ' ', text).strip()
32
+
33
+
34
def split_text_into_chunks(
    text: str,
    min_chunk_size: int = 200,
    max_chunk_size: int = 1000,
    overlap: int = 150
) -> List[str]:
    """
    Semantics-aware text chunking: split on paragraph and sentence boundaries.

    The text is first split into paragraphs (blank-line separated), each
    paragraph into sentences (on `.!?` runs followed by whitespace or end),
    and the sentences are then packed greedily into chunks of at most
    `max_chunk_size` characters, carrying up to `overlap` characters of
    trailing sentences over into the next chunk. Undersized chunks are
    merged into their predecessor.

    Args:
        text: Text to split.
        min_chunk_size: Minimum chunk size in characters; smaller chunks
            are merged into the previous one when possible.
        max_chunk_size: Maximum chunk size in characters (soft limit —
            a single oversized sentence is not broken up).
        overlap: Approximate number of trailing characters repeated at the
            start of the next chunk for context continuity.

    Returns:
        List of chunk strings (sentences joined with newlines). Empty list
        for empty/whitespace-only input; `[text]` as a fallback when no
        chunk survives filtering.
    """
    if not text or len(text.strip()) == 0:
        return []

    # Step 1: split into paragraphs on blank lines.
    paragraphs = re.split(r'\n\s*\n', text.strip())
    paragraphs = [p.strip() for p in paragraphs if p.strip()]

    if not paragraphs:
        return []

    # Step 2: split each paragraph into sentences. The capturing group keeps
    # the punctuation runs as separate parts in re.split's output.
    sentence_pattern = r'([.!?]+)(?=\s+|$)'

    all_sentences: List[str] = []
    for para in paragraphs:
        parts = re.split(sentence_pattern, para)
        combined_sentences: List[str] = []
        current_sentence = ""

        for part in parts:
            if not part.strip():
                continue
            if re.match(r'^[.!?]+$', part):
                # Punctuation run: attach it to the current sentence and
                # finalize that sentence.
                current_sentence += part
                if current_sentence.strip():
                    combined_sentences.append(current_sentence.strip())
                current_sentence = ""
            else:
                # Plain text: accumulate into the current sentence.
                current_sentence += part

        # Flush any trailing sentence without closing punctuation.
        if current_sentence.strip():
            combined_sentences.append(current_sentence.strip())

        # Paragraph produced no sentences: keep it whole as one "sentence".
        if not combined_sentences and para.strip():
            combined_sentences.append(para.strip())

        all_sentences.extend(combined_sentences)

    if not all_sentences:
        return [text] if text.strip() else []

    # Step 3: greedily pack sentences into chunks, respecting max_chunk_size.
    chunks: List[str] = []
    current_chunk: List[str] = []
    current_size = 0  # running length incl. one joiner char per sentence

    for sentence in all_sentences:
        sentence_size = len(sentence)

        # Adding this sentence would exceed the max size: close the chunk.
        if current_size + sentence_size > max_chunk_size and current_chunk:
            # Save the current chunk (sentences newline-joined).
            chunk_text = '\n'.join(current_chunk)
            if len(chunk_text.strip()) >= min_chunk_size:
                chunks.append(chunk_text)
            else:
                # Below the minimum size: merge into the previous chunk.
                if chunks:
                    chunks[-1] = chunks[-1] + '\n' + chunk_text
                else:
                    chunks.append(chunk_text)

            # Keep trailing sentences (up to `overlap` chars) for continuity.
            overlap_sentences: List[str] = []
            overlap_size = 0
            for s in reversed(current_chunk):
                if overlap_size + len(s) <= overlap:
                    overlap_sentences.insert(0, s)
                    overlap_size += len(s) + 1  # +1 for the joiner
                else:
                    break

            current_chunk = overlap_sentences + [sentence]
            current_size = overlap_size + sentence_size
        else:
            # Room left: append the sentence to the current chunk.
            current_chunk.append(sentence)
            current_size += sentence_size + 1  # +1 for the joiner

    # Flush the final chunk, merging it backwards if undersized.
    if current_chunk:
        chunk_text = '\n'.join(current_chunk)
        if chunks and len(chunk_text.strip()) < min_chunk_size:
            chunks[-1] = chunks[-1] + '\n' + chunk_text
        else:
            chunks.append(chunk_text)

    # Final pass: drop empty chunks, merge undersized ones into their
    # predecessor (or keep them if they are the only chunk).
    final_chunks: List[str] = []
    for chunk in chunks:
        chunk = chunk.strip()
        if chunk and len(chunk) >= min_chunk_size:
            final_chunks.append(chunk)
        elif chunk:
            if final_chunks:
                final_chunks[-1] = final_chunks[-1] + '\n' + chunk
            else:
                final_chunks.append(chunk)

    return final_chunks if final_chunks else [text] if text.strip() else []
156
+
157
+
158
def extract_chapter_number(text: str) -> Optional[int]:
    """
    Extract a chapter number from text.

    Tries a sequence of Korean, English, and CJK chapter-heading patterns
    (e.g. "제1장", "Chapter 1", "ch 1", "1章") against the first 500
    characters of the text, returning the first number found.

    Args:
        text: Text to scan for a chapter heading.

    Returns:
        The chapter number as an int, or None if no pattern matches.
    """
    # Candidate patterns, tried in order. All searches use re.IGNORECASE,
    # so a separate all-caps "CHAPTER" variant (present in an earlier
    # revision) was dead code and has been removed.
    patterns = [
        r'제\s*(\d+)\s*장',      # 제1장, 제 1 장
        r'제\s*(\d+)\s*화',      # 제1화
        r'Chapter\s*(\d+)',      # Chapter 1 / CHAPTER 1
        r'Ch\.\s*(\d+)',         # Ch. 1
        r'(\d+)\s*장',           # 1장
        r'(\d+)\s*화',           # 1화
        r'chap\.\s*(\d+)',       # chap. 1
        r'ch\s*(\d+)',           # ch 1
        r'(\d+)\s*章',           # 1章
    ]

    # A heading appears near the top, so only inspect the first 500 chars.
    search_text = text[:500]

    for pattern in patterns:
        match = re.search(pattern, search_text, re.IGNORECASE)
        if match:
            try:
                return int(match.group(1))
            except ValueError:
                # Defensive only: \d+ captures always parse; try next pattern.
                continue

    return None
195
+
196
+
197
+