RAG_voice / voice_rag_app.py
jeongsoo's picture
Add application file
4a98f26
"""
์Œ์„ฑ์ธ์‹ ๊ธฐ๋Šฅ์ด ์ถ”๊ฐ€๋œ RAG ์ฑ—๋ด‡ ์•ฑ
"""
import os
import time
import tempfile
from typing import List, Dict, Tuple, Any, Optional
import hashlib
import pickle
import json
# ๊ธฐ์กด ์ž„ํฌํŠธ
from config import PDF_DIRECTORY, CHUNK_SIZE, CHUNK_OVERLAP, LLM_MODEL
from optimized_document_processor import OptimizedDocumentProcessor
from vector_store import VectorStore
from langchain.schema import Document
# ํด๋กœ๋ฐ” STT ๋ชจ๋“ˆ ์ž„ํฌํŠธ
from clova_stt import ClovaSTT
# ์•ˆ์ „ํ•œ ์ž„ํฌํŠธ
# Optional import: the RAG chain requires extra dependencies (LLM backend),
# so degrade gracefully and record availability in a module-level flag.
try:
    from rag_chain import RAGChain
    RAG_CHAIN_AVAILABLE = True
except ImportError:
    print("RAG ์ฒด์ธ ๋ชจ๋“ˆ์„ ๋กœ๋“œํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.")
    RAG_CHAIN_AVAILABLE = False
class VoiceRAGChatApp:
    """
    RAG chatbot application with voice-recognition (STT) support.
    """
    def __init__(self):
        """
        Initialize the voice-enabled RAG chatbot application.

        Side effects: creates the PDF/cache directory tree, loads the
        persisted file index, initializes the Clova STT client, and
        immediately processes every PDF found in the document folder.
        """
        # Data directories: cached chunks and the vector index live under
        # a local "cached_data" folder next to the app.
        self.pdf_directory = PDF_DIRECTORY
        self.cache_directory = "cached_data"
        self.index_file = os.path.join(self.cache_directory, "file_index.json")
        self.chunks_dir = os.path.join(self.cache_directory, "chunks")
        self.vector_index_dir = os.path.join(self.cache_directory, "vector_index")
        # Create directories if they do not exist yet
        os.makedirs(self.pdf_directory, exist_ok=True)
        os.makedirs(self.cache_directory, exist_ok=True)
        os.makedirs(self.chunks_dir, exist_ok=True)
        os.makedirs(self.vector_index_dir, exist_ok=True)
        print(f"PDF ๋ฌธ์„œ ๋””๋ ‰ํ† ๋ฆฌ: '{self.pdf_directory}'")
        print(f"์บ์‹œ ๋””๋ ‰ํ† ๋ฆฌ: '{self.cache_directory}'")
        # Document processor (chunking parameters come from config.py)
        self.document_processor = OptimizedDocumentProcessor(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP
        )
        # Vector store backend (Milvus explicitly disabled -> local store)
        self.vector_store = VectorStore(use_milvus=False)
        # Load the file-path -> metadata index from disk (empty on first run)
        self.file_index = self._load_file_index()
        # Runtime state
        self.documents = []
        self.processed_files = []
        self.is_initialized = False
        # Clova STT client for the voice-input tab
        self.stt_client = ClovaSTT()
        print("์Œ์„ฑ์ธ์‹(STT) ๊ธฐ๋Šฅ์ด ์ดˆ๊ธฐํ™”๋˜์—ˆ์Šต๋‹ˆ๋‹ค.")
        # Eagerly load and process documents at startup
        print("๋ฌธ์„œ ์ž๋™ ๋กœ๋“œ ๋ฐ ์ฒ˜๋ฆฌ ์‹œ์ž‘...")
        self.auto_process_documents()
def _load_file_index(self) -> Dict[str, Dict[str, Any]]:
"""
ํŒŒ์ผ ์ธ๋ฑ์Šค ๋กœ๋“œ
Returns:
ํŒŒ์ผ ๊ฒฝ๋กœ -> ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ๋งคํ•‘
"""
if os.path.exists(self.index_file):
try:
with open(self.index_file, 'r', encoding='utf-8') as f:
return json.load(f)
except Exception as e:
print(f"์ธ๋ฑ์Šค ํŒŒ์ผ ๋กœ๋“œ ์‹คํŒจ: {e}")
return {}
return {}
def _save_file_index(self) -> None:
"""
ํŒŒ์ผ ์ธ๋ฑ์Šค ์ €์žฅ
"""
with open(self.index_file, 'w', encoding='utf-8') as f:
json.dump(self.file_index, f, ensure_ascii=False, indent=2)
def _calculate_file_hash(self, file_path: str) -> str:
"""
ํŒŒ์ผ ํ•ด์‹œ ๊ณ„์‚ฐ
Args:
file_path: ํŒŒ์ผ ๊ฒฝ๋กœ
Returns:
MD5 ํ•ด์‹œ๊ฐ’
"""
hasher = hashlib.md5()
with open(file_path, 'rb') as f:
buf = f.read(65536)
while len(buf) > 0:
hasher.update(buf)
buf = f.read(65536)
return hasher.hexdigest()
def _is_file_processed(self, file_path: str) -> bool:
"""
ํŒŒ์ผ์ด ์ด๋ฏธ ์ฒ˜๋ฆฌ๋˜์—ˆ๊ณ  ๋ณ€๊ฒฝ๋˜์ง€ ์•Š์•˜๋Š”์ง€ ํ™•์ธ
Args:
file_path: ํŒŒ์ผ ๊ฒฝ๋กœ
Returns:
์ฒ˜๋ฆฌ ์—ฌ๋ถ€
"""
if file_path not in self.file_index:
return False
# ํ˜„์žฌ ํ•ด์‹œ๊ฐ’ ๊ณ„์‚ฐ
current_hash = self._calculate_file_hash(file_path)
# ์ €์žฅ๋œ ํ•ด์‹œ๊ฐ’๊ณผ ๋น„๊ต
if self.file_index[file_path]['hash'] != current_hash:
print(f"ํŒŒ์ผ ๋ณ€๊ฒฝ ๊ฐ์ง€: {file_path}")
return False
# ์ฒญํฌ ํŒŒ์ผ ์กด์žฌ ํ™•์ธ
chunks_path = self.file_index[file_path]['chunks_path']
if not os.path.exists(chunks_path):
return False
return True
def _get_chunks_path(self, file_hash: str) -> str:
"""
์ฒญํฌ ํŒŒ์ผ ๊ฒฝ๋กœ ์ƒ์„ฑ
Args:
file_hash: ํŒŒ์ผ ํ•ด์‹œ๊ฐ’
Returns:
์ฒญํฌ ํŒŒ์ผ ๊ฒฝ๋กœ
"""
return os.path.join(self.chunks_dir, f"{file_hash}.pkl")
def _save_chunks(self, file_path: str, chunks: List[Document]) -> None:
"""
์ฒญํฌ ๋ฐ์ดํ„ฐ ์ €์žฅ
Args:
file_path: ์›๋ณธ ํŒŒ์ผ ๊ฒฝ๋กœ
chunks: ๋ฌธ์„œ ์ฒญํฌ ๋ฆฌ์ŠคํŠธ
"""
# ํ•ด์‹œ ๊ณ„์‚ฐ
file_hash = self._calculate_file_hash(file_path)
# ์ฒญํฌ ํŒŒ์ผ ๊ฒฝ๋กœ
chunks_path = self._get_chunks_path(file_hash)
# ์ฒญํฌ ๋ฐ์ดํ„ฐ ์ €์žฅ
with open(chunks_path, 'wb') as f:
pickle.dump(chunks, f)
# ์ธ๋ฑ์Šค ์—…๋ฐ์ดํŠธ
self.file_index[file_path] = {
'hash': file_hash,
'chunks_path': chunks_path,
'last_processed': time.time(),
'chunks_count': len(chunks)
}
# ์ธ๋ฑ์Šค ์ €์žฅ
self._save_file_index()
print(f"์ฒญํฌ ์ €์žฅ ์™„๋ฃŒ: {file_path} ({len(chunks)}๊ฐœ ์ฒญํฌ)")
def _load_chunks(self, file_path: str) -> List[Document]:
"""
์ €์žฅ๋œ ์ฒญํฌ ๋ฐ์ดํ„ฐ ๋กœ๋“œ
Args:
file_path: ํŒŒ์ผ ๊ฒฝ๋กœ
Returns:
๋ฌธ์„œ ์ฒญํฌ ๋ฆฌ์ŠคํŠธ
"""
chunks_path = self.file_index[file_path]['chunks_path']
with open(chunks_path, 'rb') as f:
chunks = pickle.load(f)
print(f"์ฒญํฌ ๋กœ๋“œ ์™„๋ฃŒ: {file_path} ({len(chunks)}๊ฐœ ์ฒญํฌ)")
return chunks
def _process_pdf_file(self, file_path: str) -> List[Document]:
"""
PDF ํŒŒ์ผ ์ฒ˜๋ฆฌ - docling ์‹คํŒจ ์‹œ PyPDFLoader ์‚ฌ์šฉ
Args:
file_path: ์ฒ˜๋ฆฌํ•  PDF ํŒŒ์ผ ๊ฒฝ๋กœ
Returns:
์ฒ˜๋ฆฌ๋œ ๋ฌธ์„œ ์ฒญํฌ ๋ฆฌ์ŠคํŠธ
"""
try:
print(f"docling์œผ๋กœ ์ฒ˜๋ฆฌ ์‹œ๋„: {file_path}")
# docling ์‚ฌ์šฉ ์‹œ๋„
try:
# 10์ดˆ ํƒ€์ž„์•„์›ƒ ์„ค์ • (์˜ต์…˜)
import signal
def timeout_handler(signum, frame):
raise TimeoutError("docling ์ฒ˜๋ฆฌ ์‹œ๊ฐ„ ์ดˆ๊ณผ")
# ๋ฆฌ๋ˆ…์Šค/๋งฅ์—์„œ๋งŒ ์ž‘๋™ (์œˆ๋„์šฐ์—์„œ๋Š” ๋ฌด์‹œ๋จ)
try:
signal.signal(signal.SIGALRM, timeout_handler)
signal.alarm(60) # 60์ดˆ ํƒ€์ž„์•„์›ƒ
except:
pass
# docling์œผ๋กœ ์ฒ˜๋ฆฌ ์‹œ๋„
chunks = self.document_processor.process_pdf(file_path, use_docling=True)
# ํƒ€์ž„์•„์›ƒ ์ทจ์†Œ
try:
signal.alarm(0)
except:
pass
return chunks
except Exception as e:
# docling ์˜ค๋ฅ˜ ํ™•์ธ
error_str = str(e)
if "Invalid code point" in error_str or "RuntimeError" in error_str:
print(f"docling ์ฒ˜๋ฆฌ ์˜ค๋ฅ˜ (์ฝ”๋“œ ํฌ์ธํŠธ ๋ฌธ์ œ): {error_str}")
print("PyPDFLoader๋กœ ๋Œ€์ฒดํ•ฉ๋‹ˆ๋‹ค.")
else:
print(f"docling ์ฒ˜๋ฆฌ ์˜ค๋ฅ˜: {error_str}")
print("PyPDFLoader๋กœ ๋Œ€์ฒดํ•ฉ๋‹ˆ๋‹ค.")
# PyPDFLoader๋กœ ๋Œ€์ฒด
try:
return self.document_processor.process_pdf(file_path, use_docling=False)
except Exception as inner_e:
print(f"PyPDFLoader ์ฒ˜๋ฆฌ ์˜ค๋ฅ˜: {inner_e}")
raise # ๋‘ ๋ฐฉ๋ฒ• ๋ชจ๋‘ ์‹คํŒจํ•˜๋ฉด ์˜ˆ์™ธ ๋ฐœ์ƒ
except Exception as e:
print(f"PDF ์ฒ˜๋ฆฌ ์ค‘ ์‹ฌ๊ฐํ•œ ์˜ค๋ฅ˜: {e}")
# ๋นˆ ์ฒญํฌ๋ผ๋„ ๋ฐ˜ํ™˜ํ•˜์—ฌ ์ „์ฒด ์ฒ˜๋ฆฌ๊ฐ€ ์ค‘๋‹จ๋˜์ง€ ์•Š๋„๋ก ํ•จ
return []
def auto_process_documents(self) -> str:
    """
    Automatically process every PDF in the documents folder.

    Unchanged files are served from the chunk cache; new or modified
    files are (re)processed and cached. The vector index is then loaded
    or created, updated with any new chunks, saved back to disk, and
    finally the RAG chain is (re)initialized.

    Returns:
        Human-readable status message describing what was processed.
    """
    try:
        start_time = time.time()
        # Collect the PDF files in the documents directory
        pdf_files = []
        for filename in os.listdir(self.pdf_directory):
            if filename.lower().endswith('.pdf'):
                pdf_files.append(os.path.join(self.pdf_directory, filename))
        if not pdf_files:
            return f"'{self.pdf_directory}' ํด๋”์— PDF ํŒŒ์ผ์ด ์—†์Šต๋‹ˆ๋‹ค."
        print(f"๋ฐœ๊ฒฌ๋œ PDF ํŒŒ์ผ: {len(pdf_files)}๊ฐœ")
        # Bucket the files by how they were handled this run
        new_files = []
        updated_files = []
        cached_files = []
        failed_files = []
        all_chunks = []
        for file_path in pdf_files:
            if self._is_file_processed(file_path):
                # Unchanged file: reuse cached chunks
                chunks = self._load_chunks(file_path)
                all_chunks.extend(chunks)
                cached_files.append(file_path)
                self.processed_files.append(os.path.basename(file_path))
            else:
                # New or modified file: process it now
                print(f"์ฒ˜๋ฆฌ ์ค‘: {file_path}")
                try:
                    chunks = self._process_pdf_file(file_path)
                    if chunks:  # only cache when chunks were extracted
                        self._save_chunks(file_path, chunks)
                        all_chunks.extend(chunks)
                        # Presence in the index distinguishes update vs. new
                        if file_path in self.file_index:
                            updated_files.append(file_path)
                        else:
                            new_files.append(file_path)
                        self.processed_files.append(os.path.basename(file_path))
                    else:
                        print(f"'{file_path}' ์ฒ˜๋ฆฌ ์‹คํŒจ: ์ถ”์ถœ๋œ ์ฒญํฌ ์—†์Œ")
                        failed_files.append(file_path)
                except Exception as e:
                    print(f"'{file_path}' ์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜: {e}")
                    failed_files.append(file_path)
        # Keep the full chunk list on the instance
        self.documents = all_chunks
        processing_time = time.time() - start_time
        print(f"๋ฌธ์„œ ์ฒ˜๋ฆฌ ์™„๋ฃŒ: {len(all_chunks)}๊ฐœ ์ฒญํฌ, {processing_time:.2f}์ดˆ")
        # Reuse a saved vector index when one exists on disk
        if os.path.exists(self.vector_index_dir) and any(os.listdir(self.vector_index_dir)):
            try:
                print("์ €์žฅ๋œ ๋ฒกํ„ฐ ์ธ๋ฑ์Šค ๋กœ๋“œ ์ค‘...")
                vector_store_loaded = self.vector_store.load_local(self.vector_index_dir)
                # Verify the load actually produced a usable index
                if self.vector_store.vector_store is not None:
                    # New/changed documents require an index update
                    if new_files or updated_files:
                        print("๋ฒกํ„ฐ ์ธ๋ฑ์Šค ์—…๋ฐ์ดํŠธ ์ค‘...")
                        self.vector_store.add_documents(self.documents)
                    print("๋ฒกํ„ฐ ์ธ๋ฑ์Šค ๋กœ๋“œ ์™„๋ฃŒ")
                else:
                    print("๋ฒกํ„ฐ ์ธ๋ฑ์Šค๋ฅผ ๋กœ๋“œํ–ˆ์œผ๋‚˜ ์œ ํšจํ•˜์ง€ ์•Š์Œ, ์ƒˆ๋กœ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค.")
                    self.vector_store.create_or_load(self.documents)
            except Exception as e:
                print(f"๋ฒกํ„ฐ ์ธ๋ฑ์Šค ๋กœ๋“œ ์‹คํŒจ, ์ƒˆ๋กœ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค: {e}")
                import traceback
                traceback.print_exc()
                # Fall back to building a fresh index
                self.vector_store.create_or_load(self.documents)
        else:
            # No saved index: build a new one from scratch
            print("์ƒˆ ๋ฒกํ„ฐ ์ธ๋ฑ์Šค ์ƒ์„ฑ ์ค‘...")
            self.vector_store.create_or_load(self.documents)
        # Persist the (possibly updated) vector index
        if self.vector_store and self.vector_store.vector_store is not None:
            try:
                print(f"๋ฒกํ„ฐ ์ธ๋ฑ์Šค ์ €์žฅ ์ค‘: {self.vector_index_dir}")
                save_result = self.vector_store.save_local(self.vector_index_dir)
                print(f"๋ฒกํ„ฐ ์ธ๋ฑ์Šค ์ €์žฅ ์™„๋ฃŒ: {self.vector_index_dir}")
            except Exception as e:
                print(f"๋ฒกํ„ฐ ์ธ๋ฑ์Šค ์ €์žฅ ์‹คํŒจ: {e}")
                import traceback
                traceback.print_exc()
        else:
            print("๋ฒกํ„ฐ ์ธ๋ฑ์Šค๊ฐ€ ์ดˆ๊ธฐํ™”๋˜์ง€ ์•Š์•„ ์ €์žฅํ•˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค.")
        # Wire up the RAG chain (only when its module imported successfully)
        if RAG_CHAIN_AVAILABLE:
            self.rag_chain = RAGChain(self.vector_store)
            self.is_initialized = True
            total_time = time.time() - start_time
            status_message = (
                f"๋ฌธ์„œ ์ฒ˜๋ฆฌ ์™„๋ฃŒ!\n"
                f"- ์ฒ˜๋ฆฌ๋œ ํŒŒ์ผ: {len(self.processed_files)}๊ฐœ\n"
                f"- ์บ์‹œ๋œ ํŒŒ์ผ: {len(cached_files)}๊ฐœ\n"
                f"- ์ƒˆ ํŒŒ์ผ: {len(new_files)}๊ฐœ\n"
                f"- ์—…๋ฐ์ดํŠธ๋œ ํŒŒ์ผ: {len(updated_files)}๊ฐœ\n"
                f"- ์‹คํŒจํ•œ ํŒŒ์ผ: {len(failed_files)}๊ฐœ\n"
                f"- ์ด ์ฒญํฌ ์ˆ˜: {len(self.documents)}๊ฐœ\n"
                f"- ์ฒ˜๋ฆฌ ์‹œ๊ฐ„: {total_time:.2f}์ดˆ\n"
                f"์ด์ œ ์งˆ๋ฌธํ•  ์ค€๋น„๊ฐ€ ๋˜์—ˆ์Šต๋‹ˆ๋‹ค!"
            )
            print(status_message)
            return status_message
        else:
            return "RAG ์ฒด์ธ์„ ์ดˆ๊ธฐํ™”ํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค. ํ•„์š”ํ•œ ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ๊ฐ€ ์„ค์น˜๋˜์–ด ์žˆ๋Š”์ง€ ํ™•์ธํ•˜์„ธ์š”."
    except Exception as e:
        error_message = f"๋ฌธ์„œ ์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}"
        print(error_message)
        import traceback
        traceback.print_exc()
        return error_message
def reset_cache(self) -> str:
    """
    Clear the chunk cache, the file index, and the vector index.

    Returns:
        Status message (success text, or the error when cleanup failed).
    """
    def purge_files(directory):
        # Remove every regular file directly inside `directory`.
        for entry in os.listdir(directory):
            target = os.path.join(directory, entry)
            if os.path.isfile(target):
                os.remove(target)

    try:
        purge_files(self.chunks_dir)
        # Reset and persist the (now empty) file index
        self.file_index = {}
        self._save_file_index()
        purge_files(self.vector_index_dir)
        # Clear in-memory state so the next run reprocesses everything
        self.documents = []
        self.processed_files = []
        self.is_initialized = False
        return "์บ์‹œ๊ฐ€ ์ดˆ๊ธฐํ™”๋˜์—ˆ์Šต๋‹ˆ๋‹ค. ๋‹ค์Œ ์‹คํ–‰ ์‹œ ๋ชจ๋“  ๋ฌธ์„œ๊ฐ€ ๋‹ค์‹œ ์ฒ˜๋ฆฌ๋ฉ๋‹ˆ๋‹ค."
    except Exception as e:
        return f"์บ์‹œ ์ดˆ๊ธฐํ™” ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}"
def process_query(self, query: str, chat_history: List[Tuple[str, str]]) -> Tuple[str, List[Tuple[str, str]]]:
    """
    Answer a user question through the RAG chain.

    Args:
        query: The user's question (empty input is a no-op).
        chat_history: Conversation so far; appended to in place.

    Returns:
        Tuple of (cleared input text, updated chat history).
    """
    if not query:
        return "", chat_history
    # Lazy initialization: attempt an automatic document load first.
    if not self.is_initialized:
        chat_history.append((query, "๋ฌธ์„œ ๋กœ๋“œ๊ฐ€ ์ดˆ๊ธฐํ™”๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค. ์ž๋™ ๋กœ๋“œ๋ฅผ ์‹œ๋„ํ•ฉ๋‹ˆ๋‹ค."))
        try:
            self.auto_process_documents()
        except Exception as e:
            chat_history.append((query, f"๋ฌธ์„œ ๋กœ๋“œ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}"))
            return "", chat_history
        if not self.is_initialized:
            chat_history.append((query, "๋ฌธ์„œ๋ฅผ ๋กœ๋“œํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค. 'documents' ํด๋”์— PDF ํŒŒ์ผ์ด ์žˆ๋Š”์ง€ ํ™•์ธํ•˜์„ธ์š”."))
            return "", chat_history
    try:
        # Run the RAG chain and time the round trip
        started = time.time()
        answer = self.rag_chain.run(query)
        print(f"์ฟผ๋ฆฌ ์ฒ˜๋ฆฌ ์‹œ๊ฐ„: {time.time() - started:.2f}์ดˆ")
        chat_history.append((query, answer))
    except Exception as e:
        chat_history.append((query, f"์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}"))
    return "", chat_history
def process_voice_query(self, audio, chat_history: List[Tuple[str, str]]) -> Tuple[str, List[Tuple[str, str]]]:
    """
    Transcribe recorded audio with Clova STT and feed it to process_query.

    Args:
        audio: Raw recorded audio bytes (None is a no-op).
        chat_history: Conversation so far; appended to in place.

    Returns:
        Tuple of (cleared input, updated chat history).
    """
    if audio is None:
        return "", chat_history
    try:
        # Write the audio to a temporary WAV file for the STT client.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            temp_path = tmp.name
            tmp.write(audio)
        print(f"[STT] ์ž„์‹œ ์˜ค๋””์˜ค ํŒŒ์ผ ์ƒ์„ฑ: {temp_path}")
        # Run speech-to-text
        result = self.stt_client.recognize_file(temp_path)
        # Best-effort cleanup of the temporary file
        try:
            os.unlink(temp_path)
            print("[STT] ์ž„์‹œ ์˜ค๋””์˜ค ํŒŒ์ผ ์‚ญ์ œ๋จ")
        except Exception as e:
            print(f"[STT] ์ž„์‹œ ํŒŒ์ผ ์‚ญ์ œ ์‹คํŒจ: {e}")
        # Surface STT errors directly in the chat
        if "error" in result:
            message = f"์Œ์„ฑ์ธ์‹ ์˜ค๋ฅ˜: {result.get('error')}"
            print(f"[STT] {message}")
            chat_history.append(("์Œ์„ฑ ๋ฉ”์‹œ์ง€", message))
            return "", chat_history
        recognized_text = result.get("text", "")
        if not recognized_text:
            message = "์Œ์„ฑ์„ ์ธ์‹ํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค. ๋‹ค์‹œ ์‹œ๋„ํ•ด์ฃผ์„ธ์š”."
            print("[STT] ์ธ์‹๋œ ํ…์ŠคํŠธ ์—†์Œ")
            chat_history.append(("์Œ์„ฑ ๋ฉ”์‹œ์ง€", message))
            return "", chat_history
        print(f"[STT] ์ธ์‹๋œ ํ…์ŠคํŠธ: {recognized_text}")
        # Route the transcript into the normal query path (mic prefix added)
        return self.process_query(f"๐ŸŽค {recognized_text}", chat_history)
    except Exception as e:
        message = f"์Œ์„ฑ ์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}"
        print(f"[STT] {message}")
        chat_history.append(("์Œ์„ฑ ๋ฉ”์‹œ์ง€", message))
        return "", chat_history
def launch_app(self) -> None:
    """
    Build and launch the Gradio UI (blocking call).

    Layout: a status/cache-management column on the left, and a chat
    column on the right with separate tabs for text and voice input.
    """
    import gradio as gr
    with gr.Blocks(title="์Œ์„ฑ์ธ์‹ ๊ธฐ๋Šฅ์ด ์ถ”๊ฐ€๋œ PDF ๋ฌธ์„œ ๊ธฐ๋ฐ˜ RAG ์ฑ—๋ด‡") as app:
        gr.Markdown("# ์Œ์„ฑ์ธ์‹ ๊ธฐ๋Šฅ์ด ์ถ”๊ฐ€๋œ PDF ๋ฌธ์„œ ๊ธฐ๋ฐ˜ RAG ์ฑ—๋ด‡")
        gr.Markdown(f"* ์‚ฌ์šฉ ์ค‘์ธ LLM ๋ชจ๋ธ: **{LLM_MODEL}**")
        gr.Markdown(f"* PDF ๋ฌธ์„œ ํด๋”: **{self.pdf_directory}**")
        gr.Markdown("* ๋„ค์ด๋ฒ„ ํด๋กœ๋ฐ” ์Œ์„ฑ์ธ์‹ API ํ†ตํ•ฉ")
        with gr.Row():
            with gr.Column(scale=1):
                # Document-processing status panel
                status_box = gr.Textbox(
                    label="๋ฌธ์„œ ์ฒ˜๋ฆฌ ์ƒํƒœ",
                    value=f"์ฒ˜๋ฆฌ๋œ ๋ฌธ์„œ ({len(self.processed_files)}๊ฐœ): {', '.join(self.processed_files)}",
                    lines=5,
                    interactive=False
                )
                # Cache-management buttons
                refresh_button = gr.Button("๋ฌธ์„œ ์ƒˆ๋กœ ์ฝ๊ธฐ", variant="primary")
                reset_button = gr.Button("์บ์‹œ ์ดˆ๊ธฐํ™”", variant="stop")
                # Collapsible panel listing what is currently cached
                with gr.Accordion("์บ์‹œ ์„ธ๋ถ€ ์ •๋ณด", open=False):
                    file_info = ""
                    for file_path, info in self.file_index.items():
                        file_info += f"- {os.path.basename(file_path)}: {info['chunks_count']}๊ฐœ ์ฒญํฌ\n"
                    cache_info = gr.Textbox(
                        label="์บ์‹œ๋œ ํŒŒ์ผ ์ •๋ณด",
                        value=file_info or "์บ์‹œ๋œ ํŒŒ์ผ์ด ์—†์Šต๋‹ˆ๋‹ค.",
                        lines=5,
                        interactive=False
                    )
            with gr.Column(scale=2):
                # Chat transcript
                chatbot = gr.Chatbot(
                    label="๋Œ€ํ™” ๋‚ด์šฉ",
                    bubble_full_width=False,
                    height=500,
                    show_copy_button=True
                )
                with gr.Tabs() as input_tabs:
                    # Text-input tab
                    with gr.Tab("ํ…์ŠคํŠธ ์ž…๋ ฅ"):
                        # Question box and submit button side by side
                        with gr.Row():
                            query_box = gr.Textbox(
                                label="์งˆ๋ฌธ",
                                placeholder="์ฒ˜๋ฆฌ๋œ ๋ฌธ์„œ ๋‚ด์šฉ์— ๋Œ€ํ•ด ์งˆ๋ฌธํ•˜์„ธ์š”...",
                                lines=2,
                                scale=4
                            )
                            submit_btn = gr.Button("์ „์†ก", variant="primary", scale=1)
                    # Voice-input tab
                    with gr.Tab("์Œ์„ฑ ์ž…๋ ฅ"):
                        audio_input = gr.Audio(
                            label="๋งˆ์ดํฌ ์ž…๋ ฅ",
                            sources=["microphone"],
                            type="bytes",
                            format="wav"
                        )
                        voice_submit_btn = gr.Button("์Œ์„ฑ ์งˆ๋ฌธ ์ „์†ก", variant="primary")
                clear_chat_button = gr.Button("๋Œ€ํ™” ์ดˆ๊ธฐํ™”")
        # --- Event wiring ---
        # Reprocess documents on demand
        refresh_button.click(
            fn=self.auto_process_documents,
            inputs=[],
            outputs=[status_box]
        )
        # Clear caches, then reprocess from scratch
        reset_button.click(
            fn=lambda: (self.reset_cache(), self.auto_process_documents()),
            inputs=[],
            outputs=[status_box]
        )
        # Text submit via button click
        submit_btn.click(
            fn=self.process_query,
            inputs=[query_box, chatbot],
            outputs=[query_box, chatbot]
        )
        # Text submit via Enter key
        query_box.submit(
            fn=self.process_query,
            inputs=[query_box, chatbot],
            outputs=[query_box, chatbot]
        )
        # Voice submit button
        voice_submit_btn.click(
            fn=self.process_voice_query,
            inputs=[audio_input, chatbot],
            outputs=[audio_input, chatbot]
        )
        # Reset the conversation
        clear_chat_button.click(
            fn=lambda: [],
            outputs=[chatbot]
        )
    # Start the (blocking) Gradio server; no public share link
    app.launch(share=False)
# Script entry point: build the app (processes documents immediately)
# and start the blocking Gradio UI.
if __name__ == "__main__":
    app = VoiceRAGChatApp()
    app.launch_app()