|
|
""" |
|
|
RAG full-pipeline runner script.

Steps:
  1. Preprocessing (preprocess): text extraction -> cleaning -> chunking
  2. Embedding (embed): chunk vectorization -> ChromaDB storage
  3. RAG (rag): RAG pipeline test (optional)

Usage:
  python main.py --step all          # run everything
  python main.py --step preprocess   # preprocessing only
  python main.py --step embed        # embedding only
  python main.py --step rag          # RAG test only
|
|
""" |
|
|
|
|
|
import argparse |
|
|
import sys |
|
|
from pathlib import Path |
|
|
|
|
|
from src.utils.config import PreprocessConfig |
|
|
from src.loader.preprocess_pipeline import RAGPreprocessPipeline |
|
|
|
|
|
|
|
|
def parse_arguments(argv=None) -> argparse.Namespace:
    """Parse the command-line options that drive the pipeline.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process arguments. The default ``None`` makes argparse read
            ``sys.argv[1:]``, which is what callers got before this
            parameter existed.

    Returns:
        The parsed namespace with ``step``, ``meta_csv``, ``files_dir``,
        ``output_chunks``, ``chunk_size``, ``chunk_overlap`` and ``query``
        (``query`` is ``None`` when not supplied).
    """
    parser = argparse.ArgumentParser(
        description='RAG 전체 파이프라인 실행',
        # The raw formatter keeps the line breaks of the example block below.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
예시:
  python main.py --step all                          # 전체 파이프라인 실행
  python main.py --step preprocess                   # 전처리만 실행
  python main.py --step embed                        # 임베딩만 실행
  python main.py --step rag --query "질문"            # RAG 테스트

  python main.py --step preprocess --chunk-size 500  # 청크 크기 조정
        """
    )

    # Which stage(s) of the pipeline to run.
    parser.add_argument(
        '--step',
        type=str,
        choices=['all', 'preprocess', 'embed', 'rag'],
        default='all',
        help='실행할 단계 (기본값: all)'
    )

    # Options consumed by step_preprocess.
    preprocess_group = parser.add_argument_group('전처리 옵션')
    preprocess_group.add_argument(
        '--meta-csv',
        type=str,
        default='./data/data_list.csv',
        help='메타데이터 CSV 파일 경로'
    )
    preprocess_group.add_argument(
        '--files-dir',
        type=str,
        default='./data/files/',
        help='원본 파일 폴더 경로'
    )
    preprocess_group.add_argument(
        '--output-chunks',
        type=str,
        default='./data/rag_chunks_final_small.csv',
        help='청크 출력 파일 경로'
    )
    preprocess_group.add_argument(
        '--chunk-size',
        type=int,
        default=1000,
        help='청크 크기'
    )
    preprocess_group.add_argument(
        '--chunk-overlap',
        type=int,
        default=200,
        help='청크 오버랩'
    )

    # Options consumed by step_rag.
    rag_group = parser.add_argument_group('RAG 옵션')
    rag_group.add_argument(
        '--query',
        type=str,
        help='RAG 질의 (rag 단계에서만 사용)'
    )

    return parser.parse_args(argv)
|
|
|
|
|
|
|
|
def step_preprocess(args):
    """Run step 1: preprocessing.

    Builds a ``PreprocessConfig``, overrides its paths and chunking settings
    with the CLI values, runs ``RAGPreprocessPipeline`` and prints a summary.

    Args:
        args: Parsed CLI namespace; ``meta_csv``, ``files_dir``,
            ``output_chunks``, ``chunk_size`` and ``chunk_overlap`` are read.

    Returns:
        The object returned by ``RAGPreprocessPipeline.run()``; its ``len()``
        is reported as the total chunk count.
    """
    banner = "=" * 70
    print("\n" + banner)
    print("🔧 1단계: 데이터 전처리 시작")
    print(banner)

    # Start from the project defaults, then apply the command-line overrides.
    config = PreprocessConfig()
    config.META_CSV_PATH = args.meta_csv
    config.BASE_FOLDER_PATH = args.files_dir
    config.OUTPUT_CHUNKS_PATH = args.output_chunks
    config.CHUNK_SIZE = args.chunk_size
    config.CHUNK_OVERLAP = args.chunk_overlap

    pipeline = RAGPreprocessPipeline(config)
    df_chunks = pipeline.run()

    print("\n" + banner)
    print("✅ 1단계: 전처리 완료")
    print(banner)
    print(f"📁 출력 파일: {config.OUTPUT_CHUNKS_PATH}")
    print(f"📊 총 청크 수: {len(df_chunks)}")

    return df_chunks
|
|
|
|
|
|
|
|
def step_embed(args):
    """Run step 2: embedding and vector-store construction.

    Imports ``RAGVectorDBPipeline`` lazily and calls its ``build()`` method.
    Any failure is reported on stdout and terminates the process with exit
    status 1.

    Args:
        args: Parsed CLI namespace. It is accepted for symmetry with the
            other steps but is not read here; the pipeline is constructed
            with no arguments.
    """
    import traceback

    banner = "=" * 70
    print("\n" + banner)
    print("🔧 2단계: 임베딩 및 벡터DB 구축 시작")
    print(banner)

    # Only the import is guarded by the "module not found" hint, so an
    # ImportError raised later inside build() is reported as a runtime
    # failure with a traceback instead of as a missing file.
    try:
        from src.embedding.rag_data_processing import RAGVectorDBPipeline
    except ImportError as e:
        print(f"⚠️ 임베딩 모듈을 찾을 수 없습니다: {e}")
        print("   src/embedding/rag_data_processing.py 파일이 있는지 확인하세요.")
        sys.exit(1)

    try:
        RAGVectorDBPipeline().build()
    except Exception as e:
        print(f"❌ 임베딩 실행 중 오류 발생: {e}")
        traceback.print_exc()
        sys.exit(1)

    print("\n" + banner)
    print("✅ 2단계: 임베딩 완료")
    print(banner)
|
|
|
|
|
|
|
|
def step_rag(args):
    """Run step 3: RAG pipeline smoke test.

    Imports ``RAGPipeline`` and ``RAGConfig`` lazily, constructs the pipeline
    and, when a query was supplied, prints the generated answer, the number
    of source documents and the token usage. Any failure is reported on
    stdout and terminates the process with exit status 1.

    Args:
        args: Parsed CLI namespace; only ``query`` is read. When it is empty
            the pipeline is still constructed but no question is asked.
    """
    import traceback

    banner = "=" * 70
    print("\n" + banner)
    print("🔧 3단계: RAG 파이프라인 테스트")
    print(banner)

    # Only the imports are guarded by the "module not found" hint, so an
    # ImportError raised while the pipeline runs is reported as a runtime
    # failure with a traceback. The hint names the modules actually imported.
    try:
        from src.generator.generator import RAGPipeline
        from src.utils.rag_config import RAGConfig
    except ImportError as e:
        print(f"⚠️ RAG 모듈을 찾을 수 없습니다: {e}")
        print("   src/generator/generator.py, src/utils/rag_config.py 파일이 있는지 확인하세요.")
        sys.exit(1)

    try:
        rag = RAGPipeline(config=RAGConfig())

        if args.query:
            print(f"\n📝 질의: {args.query}")
            result = rag.generate_answer(args.query)

            print("\n💬 답변:")
            print(result['answer'])
            print(f"\n📚 참고 문서: {len(result.get('sources') or [])}개")

            # Usage data is read as tolerantly as 'sources' so a missing
            # field does not abort after the answer has been shown.
            usage = result.get('usage') or {}
            if 'total_tokens' in usage:
                print(f"🔢 토큰 사용: {usage['total_tokens']}")
        else:
            print("\n⚠️ --query 인자가 없어 테스트 질의를 건너뜁니다.")
            print("   예시: python main.py --step rag --query '한영대학교 특성화 사업은?'")
    except Exception as e:
        print(f"❌ RAG 실행 중 오류 발생: {e}")
        traceback.print_exc()
        sys.exit(1)

    print("\n" + banner)
    print("✅ 3단계: RAG 파이프라인 완료")
    print(banner)
|
|
|
|
|
|
|
|
def main():
    """Parse the CLI arguments and run the requested pipeline step(s).

    ``--step all`` runs preprocessing and embedding, and runs the RAG test
    only when ``--query`` was given. Ctrl-C and any unhandled exception end
    the process with exit status 1; exceptions also print a traceback.
    """
    args = parse_arguments()

    banner = "=" * 70
    print(banner)
    print("🚀 RAG 전체 파이프라인")
    print(banner)
    print(f"실행 단계: {args.step}")

    try:
        if args.step == 'all':
            step_preprocess(args)
            step_embed(args)

            # The RAG test is optional in a full run: it needs a question.
            if args.query:
                step_rag(args)

        elif args.step == 'preprocess':
            step_preprocess(args)

        elif args.step == 'embed':
            step_embed(args)

        elif args.step == 'rag':
            step_rag(args)

        print("\n" + banner)
        print("🎉 모든 작업 완료!")
        print(banner)

    except KeyboardInterrupt:
        print("\n\n⚠️ 사용자에 의해 중단되었습니다.")
        sys.exit(1)
    except Exception as e:
        print(f"\n❌ 오류 발생: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
|
|
|
|
|
|
|
|
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()