Spaces:
Paused
Paused
| import os | |
| import logging | |
| from langchain.docstore.document import Document | |
| from langchain_community.document_loaders import PyMuPDFLoader | |
class ProcessDocs:
    """Load a PDF or plain-text file and expose its content as one string.

    Supported ``file_type`` values are ``"pdf"`` (read through
    ``PyMuPDFLoader``) and ``"txt"`` (read as UTF-8 text). Any other value
    is rejected when ``load_documents`` is called, not at construction time.

    Attributes:
        file_path: Path of the file to read, as given by the caller.
        language: Language code supplied by the caller. It is only stored;
            no method in this class reads it.
        file_type: Lower-cased file type selector.
        documents: Documents produced by the most recent load
            (empty until ``load_documents`` has run).
        file_name: Base name of ``file_path``; used in log messages.
        page_stats: List returned by ``get_page_stats``. No method in this
            class appends to it.
    """

    def __init__(self, file_path: str, language: str = 'en', file_type: str = 'txt') -> None:
        """Store the configuration; no file access happens here.

        Args:
            file_path: Path of the file to process.
            language: Language code kept on the instance.
            file_type: ``"pdf"`` or ``"txt"`` (case-insensitive).
        """
        self.file_path = file_path
        self.language = language
        self.file_type = file_type.lower()
        self.documents = []
        self.file_name = os.path.basename(self.file_path)
        self.page_stats = []

    def load_documents(self) -> list:
        """Read the file and replace ``self.documents`` with the result.

        For ``"pdf"`` the documents are whatever ``PyMuPDFLoader.load()``
        returns. For ``"txt"`` the whole file becomes a single ``Document``
        whose metadata carries ``page`` = 1 and the file path.

        Returns:
            The freshly loaded list (the same object as ``self.documents``).

        Raises:
            ValueError: If ``file_type`` is neither ``"pdf"`` nor ``"txt"``.
            Exception: Any error raised while reading the file is logged
                and re-raised unchanged.
        """
        try:
            if self.file_type == "pdf":
                self.documents = PyMuPDFLoader(self.file_path).load()
            elif self.file_type == "txt":
                with open(self.file_path, "r", encoding="utf-8") as handle:
                    content = handle.read()
                self.documents = [
                    Document(
                        page_content=content,
                        metadata={"page": 1, "file_path": self.file_path},
                    )
                ]
            else:
                raise ValueError(f"Unsupported file type: {self.file_type}")
            # Lazy %-style arguments: same text as before, formatted only
            # when the record is actually emitted.
            logging.info(
                "Loaded %d documents from %s.", len(self.documents), self.file_name
            )
        except Exception as e:
            # Log for visibility, then let the caller decide how to react.
            logging.error("Error loading document: %s", e)
            raise
        return self.documents

    def _join_documents(self) -> str:
        """Return the ``page_content`` of every loaded document, newline-joined."""
        return "\n".join(doc.page_content for doc in self.documents)

    def get_full_text(self) -> str:
        """Return the text of all documents joined by newlines.

        Loads the file first when ``self.documents`` is empty; otherwise the
        already loaded documents are reused. Word/character/token statistics
        are not computed here, so ``page_stats`` is left untouched.
        """
        if not self.documents:
            self.load_documents()
        return self._join_documents()

    def process(self) -> str:
        """Reload the file and return its full text.

        Unlike ``get_full_text`` this always reloads. The text is joined
        directly from the fresh result so that a load which yields no
        documents is not repeated a second time.
        """
        self.load_documents()
        return self._join_documents()

    def get_page_stats(self) -> list:
        """Return ``self.page_stats``, warning when it is empty."""
        if not self.page_stats:
            # NOTE(review): the message refers to save_chunks(), which is not
            # defined in this class, and nothing here fills page_stats.
            # Confirm where the stats are meant to come from.
            logging.warning("page_stats is empty; run save_chunks() first.")
        return self.page_stats