Demos / backend /classes /data_preparer.py
nikhile-galileo's picture
Adding finance protect demo
e68d535
raw
history blame
2.11 kB
from typing import List, Any
from pydantic import BaseModel
from pathlib import Path
from backend.classes.pdf_extractor import PyMuPDFExtractor, PyMuPDFExtractorConfig
from backend.utils.utils import create_pdf_extractor
from backend.classes.pdf_extractor import BasePDFExtractorConfig
class DataPreparerConfig(BaseModel):
    """Settings consumed by ``DataPreparer``."""

    # Directory that is searched recursively for PDF files.
    input_data_path: str
    # Directory that the output file name is joined onto.
    output_data_path: str
    # Output file name template; it is expanded with
    # ``str.format(file_name=...)`` once per PDF.
    output_file: str
    # Extractor settings.
    # NOTE(review): in the visible code DataPreparer.__init__ builds a default
    # PyMuPDFExtractorConfig and does not read this field -- confirm intent.
    pdf_extractor: BasePDFExtractorConfig
class DataPreparer:
    """Extract every PDF under an input directory into per-file JSONL output.

    The input directory, output directory and output file name template are
    taken from the supplied ``DataPreparerConfig``.
    """

    def __init__(self, config: "DataPreparerConfig"):
        """Store the configured paths and build the PDF extractor.

        Args:
            config: Paths and output file name template for this run.
        """
        self.config = config
        self.input_data_path = self.config.input_data_path
        self.output_data_path = self.config.output_data_path
        self.output_file = self.config.output_file
        # NOTE(review): ``config.pdf_extractor`` is not consulted here; a
        # default PyMuPDF configuration is always used. Kept as-is because it
        # is unclear whether create_pdf_extractor accepts the base config
        # type -- confirm and wire it through if that is the intent.
        self.pdf_extractor_config = PyMuPDFExtractorConfig()
        self.pdf_extractor = create_pdf_extractor(PyMuPDFExtractor, self.pdf_extractor_config)

    def get_pdf_files(self) -> List[Path]:
        """Return all PDF files below ``input_data_path``, searched recursively.

        The extension is matched case-insensitively so that ``report.PDF`` is
        found on case-sensitive filesystems too. Directories whose name
        happens to end in ``.pdf`` are skipped, and the result is sorted so
        that runs are reproducible.

        Returns:
            Sorted list of paths to the PDF files that were found.
        """
        return sorted(
            path
            for path in Path(self.input_data_path).rglob("*")
            if path.is_file() and path.name.lower().endswith(".pdf")
        )

    def save_data_to_jsonl(self, data: List[Any], file_path: str):
        """Write ``data`` to ``file_path`` as JSON Lines, one entry per line.

        Missing parent directories are created first. Saving is best-effort:
        any failure is reported on stdout and not re-raised, so one bad file
        does not abort a whole batch.

        Args:
            data: Objects exposing ``model_dump_json()``.
            file_path: Destination file; it is overwritten if it exists.
        """
        try:
            target = Path(file_path)
            # Without this, a fresh output directory makes every write fail.
            target.parent.mkdir(parents=True, exist_ok=True)
            with open(target, "w", encoding="utf-8") as f:
                for entry in data:
                    f.write(entry.model_dump_json() + "\n")
        except Exception as e:
            print(f"Error saving data to file: {e}")

    def prepare_data(self):
        """Extract each discovered PDF and save the result as a JSONL file.

        The output name is built by formatting the ``output_file`` template
        with the PDF's stem, with spaces replaced by underscores.
        """
        for pdf_file in self.get_pdf_files():
            pdf_data = self.pdf_extractor.extract(pdf_file)
            # NOTE(review): only the stem is used, so PDFs with the same name
            # in different sub-folders map to the same output file and the
            # later one overwrites the earlier one.
            file_name = pdf_file.stem.replace(" ", "_")
            output_file = self.output_file.format(file_name=file_name)
            self.save_data_to_jsonl(pdf_data, str(Path(self.output_data_path) / output_file))