| from abc import ABC, abstractmethod | |
| from src.rag.retrieval import DocDB | |
class IRawDataProcessor(ABC):
    """Interface for converting between raw data files and query files.

    Both methods are no-op stubs here and return ``None``; concrete
    processors are expected to override them.

    NOTE(review): the methods are not decorated with ``@abstractmethod``,
    so this class can be instantiated and subclasses are not forced to
    override anything -- confirm whether that is intentional.
    """

    def get_queries(self, input_file: str, output_file: str):
        """Turn raw data into structured query data.

        Args:
            input_file: Path of the raw data file to read.
            output_file: Path for the result -- presumably where the
                structured query data is written; confirm with implementers.
        """

    def get_documents(self, query_file: str, output_file: str):
        """Turn structured query data back into raw data.

        Args:
            query_file: Path of the structured query file to read.
            output_file: Path for the result -- presumably where the raw
                data is written; confirm with implementers.
        """
class DatasetProcessor(ABC):
    """Common base class for dataset processors.

    The methods below are placeholders that return ``None``; dataset-specific
    subclasses are expected to override them.

    NOTE(review): neither method is marked ``@abstractmethod``, so overriding
    is not enforced -- confirm whether that is intentional.
    """

    def process_queries(self, input_file: str, **kwargs) -> list:
        """Build the formatted queries found in ``input_file``.

        Args:
            input_file: Path of the file holding the queries.
            **kwargs: Processor-specific options; unused by this base stub.

        Returns:
            Overrides should return a list of formatted queries. This base
            implementation returns ``None``.
        """

    def process_documents(self, query_file: str, db: DocDB, **kwargs) -> dict:
        """Collect the documents that belong to the queries in ``query_file``.

        Args:
            query_file: Path of the file holding the queries.
            db: Document database handle -- presumably used to look up the
                documents; confirm against implementers.
            **kwargs: Processor-specific options; unused by this base stub.

        Returns:
            Overrides should return a dictionary of documents. This base
            implementation returns ``None``.
        """