prototype02 / lib /process_docs.py
saad810's picture
Upload 17 files
5bd21bc verified
Raw
History Blame Contribute Delete
2.2 kB
import os
import logging
from langchain.docstore.document import Document
from langchain_community.document_loaders import PyMuPDFLoader
class ProcessDocs:
    """Load one PDF or plain-text file and expose its content as text.

    PDFs are read with ``PyMuPDFLoader``; ``.txt`` files are read as UTF-8 and
    wrapped in a single ``Document``. The loaded documents are cached on
    ``self.documents``.

    Attributes:
        file_path: Path of the file to load, exactly as passed in.
        language: Language code supplied by the caller. It is stored but not
            consulted by any method of this class.
        file_type: Normalised file type (lower-case, no leading dot).
        documents: Documents produced by the most recent successful load.
        file_name: Base name of ``file_path`` (used in log messages).
        page_stats: List returned by ``get_page_stats``; no method of this
            class appends to it.
    """

    def __init__(self, file_path: str, language: str = 'en', file_type: str = 'txt'):
        """Record the file location and options; nothing is read from disk here.

        Args:
            file_path: Path of the file to load.
            language: Language code of the content (stored only).
            file_type: ``"pdf"`` or ``"txt"``. Case, surrounding whitespace
                and a leading dot are ignored, so ``".PDF"`` selects the PDF
                loader as well.
        """
        self.file_path = file_path
        self.language = language
        # Accept extension-style values such as ".pdf" (e.g. straight from
        # os.path.splitext) in addition to the bare "pdf" / "txt".
        self.file_type = file_type.strip().lstrip(".").lower()
        self.documents = []
        self.file_name = os.path.basename(self.file_path)
        self.page_stats = []

    def load_documents(self):
        """Read the file from disk and cache the result on ``self.documents``.

        Returns:
            The list of loaded documents (the same object as
            ``self.documents``).

        Raises:
            ValueError: If ``file_type`` is neither ``"pdf"`` nor ``"txt"``.
            Exception: Any error raised while opening or parsing the file is
                logged and re-raised unchanged.
        """
        try:
            if self.file_type == "pdf":
                self.documents = PyMuPDFLoader(self.file_path).load()
            elif self.file_type == "txt":
                with open(self.file_path, "r", encoding="utf-8") as f:
                    content = f.read()
                # A text file has no pages; the whole file becomes one
                # document labelled as page 1.
                self.documents = [
                    Document(
                        page_content=content,
                        metadata={"page": 1, "file_path": self.file_path},
                    )
                ]
            else:
                raise ValueError(f"Unsupported file type: {self.file_type}")
        except Exception as e:
            # Log at this boundary for visibility, then let the caller decide
            # how to handle the failure.
            logging.error("Error loading document: %s", e)
            raise
        logging.info(
            "Loaded %d documents from %s.", len(self.documents), self.file_name
        )
        return self.documents

    @staticmethod
    def _join_pages(documents) -> str:
        """Concatenate the ``page_content`` of each document with newlines."""
        return "\n".join(doc.page_content for doc in documents)

    def get_full_text(self) -> str:
        """Return the text of all documents, loading them first if none are cached.

        Returns:
            The ``page_content`` of every document joined by ``"\\n"``; an
            empty string when the file yields no documents.
        """
        if not self.documents:
            self.load_documents()
        return self._join_pages(self.documents)

    def process(self) -> str:
        """Reload the file from disk and return its full text.

        Unlike ``get_full_text`` this always re-reads the file. The load is
        performed exactly once, even when it yields no documents.
        """
        return self._join_pages(self.load_documents())

    def get_page_stats(self) -> list:
        """Return ``self.page_stats``, warning when it is empty."""
        if not self.page_stats:
            # NOTE(review): the message refers to save_chunks(), which is not
            # defined on this class, and nothing here fills page_stats --
            # confirm the intended producer and update the wording.
            logging.warning("page_stats is empty; run save_chunks() first.")
        return self.page_stats