import logging
import os

from langchain.docstore.document import Document
from langchain_community.document_loaders import PyMuPDFLoader


class ProcessDocs:
    """Load a single PDF or plain-text file and expose its text content.

    Attributes are populated as follows:

    * ``file_path`` / ``file_name`` -- the path given by the caller and its
      base name (used only in log messages).
    * ``language`` -- stored as given; not used by any method in this class.
    * ``file_type`` -- lower-cased type tag; ``"pdf"`` and ``"txt"`` are the
      only values ``load_documents`` accepts.
    * ``documents`` -- the loaded ``Document`` objects (empty until loaded).
    * ``page_stats`` -- list returned by ``get_page_stats``; nothing in this
      class appends to it.
    """

    def __init__(self, file_path: str, language: str = 'en', file_type: str = 'txt'):
        """Record the file location and options; no I/O happens here.

        Args:
            file_path: Path of the file to load.
            language: Language tag, stored unchanged.
            file_type: ``"pdf"`` or ``"txt"`` (case-insensitive).
        """
        self.file_path = file_path
        self.language = language
        self.file_type = file_type.lower()
        self.documents = []
        self.file_name = os.path.basename(self.file_path)
        self.page_stats = []
        # Set once load_documents() succeeds, so that a source which yields
        # no documents is not re-read by get_full_text().
        self._loaded = False

    def load_documents(self):
        """Read the file into ``self.documents`` and return that list.

        PDFs are delegated to ``PyMuPDFLoader``; text files are read as UTF-8
        into one ``Document`` whose metadata carries ``page`` (always 1) and
        ``file_path``.

        Returns:
            The list stored in ``self.documents``.

        Raises:
            ValueError: If ``file_type`` is neither ``"pdf"`` nor ``"txt"``.
            Exception: Any error raised while reading the file is logged and
                re-raised unchanged.
        """
        try:
            if self.file_type == "pdf":
                loader = PyMuPDFLoader(self.file_path)
                self.documents = loader.load()
            elif self.file_type == "txt":
                with open(self.file_path, "r", encoding="utf-8") as f:
                    content = f.read()
                self.documents = [
                    Document(
                        page_content=content,
                        metadata={"page": 1, "file_path": self.file_path},
                    )
                ]
            else:
                raise ValueError(f"Unsupported file type: {self.file_type}")
            self._loaded = True
            logging.info(
                "Loaded %s documents from %s.", len(self.documents), self.file_name
            )
        except Exception as e:
            # Log for visibility, then let the caller decide how to recover.
            logging.error("Error loading document: %s", e)
            raise
        return self.documents

    def get_full_text(self) -> str:
        """Return the page contents of all documents joined by newlines.

        Loads the file first when nothing has been loaded yet. If a previous
        load succeeded but produced no documents, the file is not read again
        and an empty string is returned.
        """
        if not self.documents and not self._loaded:
            self.load_documents()
        # NOTE: word/char/token statistics are not computed here, so
        # page_stats is left untouched by this method.
        return "\n".join(doc.page_content for doc in self.documents)

    def process(self) -> str:
        """Load the file (unconditionally) and return its full text."""
        self.load_documents()
        full_text = self.get_full_text()
        return full_text

    def get_page_stats(self) -> list:
        """Return ``self.page_stats``, logging a warning when it is empty."""
        if not self.page_stats:
            # NOTE(review): save_chunks() is not defined in this class as
            # shown; confirm where page_stats is meant to be populated.
            logging.warning("page_stats is empty; run save_chunks() first.")
        return self.page_stats