# RAG5_SearchStrategy_FastSTT/utils/document_processor.py
"""
๋ฌธ์„œ ์ฒ˜๋ฆฌ ์œ ํ‹ธ๋ฆฌํ‹ฐ ๋ชจ๋“ˆ
"""
import os
import re
import logging
from typing import Any, Dict, List, Optional, Tuple
logger = logging.getLogger("DocProcessor")
if not logger.hasHandlers():
handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(logging.INFO)
class DocumentProcessor:
"""๋ฌธ์„œ ์ฒ˜๋ฆฌ ์œ ํ‹ธ๋ฆฌํ‹ฐ ํด๋ž˜์Šค"""
@staticmethod
def split_text(
text: str,
chunk_size: int = 512,
chunk_overlap: int = 50,
separator: str = "\n"
) -> List[str]:
"""
ํ…์ŠคํŠธ๋ฅผ ๋” ์ž‘์€ ์ฒญํฌ๋กœ ๋ถ„ํ• 
Args:
text: ๋ถ„ํ• ํ•  ํ…์ŠคํŠธ
chunk_size: ๊ฐ ์ฒญํฌ์˜ ์ตœ๋Œ€ ๋ฌธ์ž ์ˆ˜
chunk_overlap: ์ฒญํฌ ๊ฐ„ ์ค‘์ฒฉ๋˜๋Š” ๋ฌธ์ž ์ˆ˜
separator: ๋ถ„ํ•  ์‹œ ์‚ฌ์šฉํ•  ๊ตฌ๋ถ„์ž
Returns:
๋ถ„ํ• ๋œ ํ…์ŠคํŠธ ์ฒญํฌ ๋ชฉ๋ก
"""
if not text or chunk_size <= 0:
return []
# ๊ตฌ๋ถ„์ž๋กœ ๋ถ„ํ• 
parts = text.split(separator)
chunks = []
current_chunk = []
current_size = 0
for part in parts:
part_size = len(part)
if current_size + part_size + len(current_chunk) > chunk_size and current_chunk:
# ํ˜„์žฌ ์ฒญํฌ๊ฐ€ ์ตœ๋Œ€ ํฌ๊ธฐ๋ฅผ ์ดˆ๊ณผํ•˜๋ฉด ์ €์žฅ
chunks.append(separator.join(current_chunk))
# ์ค‘์ฒฉ์„ ์œ„ํ•ด ์ผ๋ถ€ ์ฒญํฌ ์œ ์ง€
overlap_tokens = []
overlap_size = 0
for token in reversed(current_chunk):
if overlap_size + len(token) <= chunk_overlap:
overlap_tokens.insert(0, token)
overlap_size += len(token) + 1 # separator ๊ธธ์ด ํฌํ•จ
else:
break
current_chunk = overlap_tokens
current_size = overlap_size - len(current_chunk) # separator ๊ธธ์ด ์ œ์™ธ
current_chunk.append(part)
current_size += part_size
        # Append the last chunk
if current_chunk:
chunks.append(separator.join(current_chunk))
return chunks
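
    # Illustrative behavior (hypothetical inputs): sizes count characters plus the
    # separators that join parts, and the tail of each chunk is carried over as overlap.
    #   DocumentProcessor.split_text("aaaa\nbbbb\ncccc\ndddd", chunk_size=14, chunk_overlap=5)
    #   # -> ["aaaa\nbbbb\ncccc", "cccc\ndddd"]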
@staticmethod
def clean_text(text: str, remove_urls: bool = True, remove_extra_whitespace: bool = True) -> str:
"""
ํ…์ŠคํŠธ ์ •์ œ
Args:
text: ์ •์ œํ•  ํ…์ŠคํŠธ
remove_urls: URL ์ œ๊ฑฐ ์—ฌ๋ถ€
remove_extra_whitespace: ์—ฌ๋ถ„์˜ ๊ณต๋ฐฑ ์ œ๊ฑฐ ์—ฌ๋ถ€
Returns:
์ •์ œ๋œ ํ…์ŠคํŠธ
"""
if not text:
return ""
        # Strip URLs
        if remove_urls:
            text = re.sub(r'https?://\S+|www\.\S+', '', text)
        # Strip HTML tags
        text = re.sub(r'<.*?>', '', text)
        # Collapse extra whitespace
if remove_extra_whitespace:
text = re.sub(r'\s+', ' ', text).strip()
return text
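
    # Illustrative (hypothetical input):
    #   DocumentProcessor.clean_text("See <b>this</b>   at https://example.com now")
    #   # -> "See this at now"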
@staticmethod
def text_to_documents(
text: str,
metadata: Optional[Dict[str, Any]] = None,
chunk_size: int = 512,
chunk_overlap: int = 50
) -> List[Dict[str, Any]]:
"""
ํ…์ŠคํŠธ๋ฅผ ๋ฌธ์„œ ๊ฐ์ฒด ๋ชฉ๋ก์œผ๋กœ ๋ณ€ํ™˜
Args:
text: ๋ณ€ํ™˜ํ•  ํ…์ŠคํŠธ
metadata: ๋ฌธ์„œ์— ์ถ”๊ฐ€ํ•  ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ
chunk_size: ๊ฐ ์ฒญํฌ์˜ ์ตœ๋Œ€ ๋ฌธ์ž ์ˆ˜
chunk_overlap: ์ฒญํฌ ๊ฐ„ ์ค‘์ฒฉ๋˜๋Š” ๋ฌธ์ž ์ˆ˜
Returns:
๋ฌธ์„œ ๊ฐ์ฒด ๋ชฉ๋ก
"""
if not text:
return []
        # Clean the text
clean = DocumentProcessor.clean_text(text)
        # Split into chunks
chunks = DocumentProcessor.split_text(
clean,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap
)
# ๋ฌธ์„œ ๊ฐ์ฒด ์ƒ์„ฑ
documents = []
for i, chunk in enumerate(chunks):
doc = {
"text": chunk,
"index": i,
"chunk_count": len(chunks)
}
# ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ์ถ”๊ฐ€
if metadata:
doc.update(metadata)
documents.append(doc)
return documents
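
    # Illustrative (hypothetical metadata): each chunk becomes a dict carrying its
    # text, position, total chunk count, and any caller-supplied metadata, e.g.
    #   DocumentProcessor.text_to_documents("hello", metadata={"source": "notes.txt"})
    #   # -> [{"text": "hello", "index": 0, "chunk_count": 1, "source": "notes.txt"}]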
@staticmethod
def load_documents_from_directory(
directory: str,
        extensions: Tuple[str, ...] = (".txt", ".md", ".csv"),
recursive: bool = True,
chunk_size: int = 512,
chunk_overlap: int = 50
) -> List[Dict[str, Any]]:
"""
๋””๋ ‰ํ† ๋ฆฌ์—์„œ ๋ฌธ์„œ ๋กœ๋“œ ๋ฐ ์ฒ˜๋ฆฌ
Args:
directory: ๋กœ๋“œํ•  ๋””๋ ‰ํ† ๋ฆฌ ๊ฒฝ๋กœ
extensions: ์ฒ˜๋ฆฌํ•  ํŒŒ์ผ ํ™•์žฅ์ž ๋ชฉ๋ก
recursive: ํ•˜์œ„ ๋””๋ ‰ํ† ๋ฆฌ ๊ฒ€์ƒ‰ ์—ฌ๋ถ€
chunk_size: ๊ฐ ์ฒญํฌ์˜ ์ตœ๋Œ€ ๋ฌธ์ž ์ˆ˜
chunk_overlap: ์ฒญํฌ ๊ฐ„ ์ค‘์ฒฉ๋˜๋Š” ๋ฌธ์ž ์ˆ˜
Returns:
๋ฌธ์„œ ๊ฐ์ฒด ๋ชฉ๋ก
"""
if not os.path.isdir(directory):
logger.error(f"๋””๋ ‰ํ† ๋ฆฌ๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค: {directory}")
return []
documents = []
        for root, dirs, files in os.walk(directory):
            if not recursive:
                # Prune subdirectories in place so os.walk does not descend
                dirs[:] = []
for file in files:
_, ext = os.path.splitext(file)
if ext.lower() not in extensions:
continue
file_path = os.path.join(root, file)
rel_path = os.path.relpath(file_path, directory)
try:
logger.info(f"ํŒŒ์ผ ๋กœ๋“œ ์ค‘: {rel_path}")
# ๋จผ์ € UTF-8๋กœ ์‹œ๋„
try:
with open(file_path, 'r', encoding='utf-8') as f:
content = f.read()
except UnicodeDecodeError:
                        # If UTF-8 fails, fall back to CP949 (the default Korean Windows encoding)
                        logger.info(f"UTF-8 decoding failed, retrying with CP949: {rel_path}")
with open(file_path, 'r', encoding='cp949') as f:
content = f.read()
# ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ์ƒ์„ฑ
metadata = {
"source": rel_path,
"filename": file,
"filetype": ext.lower()[1:],
"filepath": file_path
}
                    # Process the document
file_docs = DocumentProcessor.text_to_documents(
content,
metadata=metadata,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap
)
documents.extend(file_docs)
logger.info(f"{len(file_docs)}๊ฐœ ์ฒญํฌ ์ถ”์ถœ: {rel_path}")
except Exception as e:
logger.error(f"ํŒŒ์ผ '{rel_path}' ์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
continue
logger.info(f"์ด {len(documents)}๊ฐœ ๋ฌธ์„œ ์ฒญํฌ๋ฅผ ๋กœ๋“œํ–ˆ์Šต๋‹ˆ๋‹ค.")
return documents
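
    # Illustrative (hypothetical path):
    #   docs = DocumentProcessor.load_documents_from_directory("data/corpus", recursive=False)
    #   # -> one dict per chunk, tagged with source/filename/filetype/filepath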
@staticmethod
def prepare_rag_context(results: List[Dict[str, Any]], field: str = "text") -> List[str]:
"""
๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ์—์„œ RAG์— ์‚ฌ์šฉํ•  ์ปจํ…์ŠคํŠธ ์ถ”์ถœ
Args:
results: ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ ๋ชฉ๋ก
field: ํ…์ŠคํŠธ ๋‚ด์šฉ์ด ์žˆ๋Š” ํ•„๋“œ ์ด๋ฆ„
Returns:
์ปจํ…์ŠคํŠธ ํ…์ŠคํŠธ ๋ชฉ๋ก
"""
context = []
for result in results:
if field in result:
context.append(result[field])
return context
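

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only; the sample text and metadata below
    # are hypothetical, not part of the original module).
    sample = (
        "RAG systems retrieve documents.\n"
        "They ground model answers.\n"
        "See https://example.com for more."
    )
    cleaned = DocumentProcessor.clean_text(sample)
    logger.info(f"Cleaned text: {cleaned}")
    docs = DocumentProcessor.text_to_documents(
        sample,
        metadata={"source": "inline-sample"},
        chunk_size=64,
        chunk_overlap=16,
    )
    logger.info(f"Built {len(docs)} document chunk(s) from the sample text")
    # Treat the chunks as if they were search results and extract RAG context
    for passage in DocumentProcessor.prepare_rag_context(docs):
        logger.info(f"Context passage: {passage}")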