File size: 2,201 Bytes
49202c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import os
import logging
from langchain.docstore.document import Document
from langchain_community.document_loaders import PyMuPDFLoader


class ProcessDocs:
    """Load a PDF or plain-text file and expose its content as a single string.

    Attributes are plain public fields:

    * ``file_path`` / ``file_name`` -- the path as given and its basename.
    * ``language`` -- stored as given; nothing in this class reads it.
    * ``file_type`` -- normalised type tag; ``"pdf"`` and ``"txt"`` are supported.
    * ``documents`` -- the loaded document objects (empty until a load succeeds).
    * ``page_stats`` -- starts empty; no method of this class adds to it.
    """

    def __init__(self, file_path: str, language: str = 'en', file_type: str = 'txt'):
        """Record the source file and how it should be read.

        Args:
            file_path: Path of the file to load.
            language: Language tag kept on the instance.
            file_type: ``"pdf"`` or ``"txt"``. Case, surrounding whitespace and
                leading dots are ignored, so ``".PDF"`` is treated as ``"pdf"``.
        """
        self.file_path = file_path
        self.language = language
        # Callers often pass an extension straight from os.path.splitext
        # (".pdf"); normalise so that spelling selects the same loader.
        self.file_type = file_type.strip().lower().lstrip(".")
        self.documents = []
        self.file_name = os.path.basename(self.file_path)
        self.page_stats = []

    def load_documents(self):
        """Read ``file_path`` into ``self.documents`` and return that list.

        A ``pdf`` is delegated to ``PyMuPDFLoader``; a ``txt`` file is read as
        UTF-8 and wrapped in one ``Document`` whose metadata carries
        ``page`` = 1 and the file path.

        Raises:
            ValueError: If ``file_type`` is neither ``"pdf"`` nor ``"txt"``.
            Exception: Any error raised while reading the file is logged and
                re-raised unchanged.
        """
        try:
            if self.file_type == "pdf":
                self.documents = PyMuPDFLoader(self.file_path).load()
            elif self.file_type == "txt":
                with open(self.file_path, "r", encoding="utf-8") as f:
                    content = f.read()
                self.documents = [
                    Document(
                        page_content=content,
                        metadata={"page": 1, "file_path": self.file_path},
                    )
                ]
            else:
                raise ValueError(f"Unsupported file type: {self.file_type}")
            logging.info(
                "Loaded %d documents from %s.", len(self.documents), self.file_name
            )
        except Exception as e:
            # Log for the operator, then let the caller decide how to recover.
            logging.error("Error loading document: %s", e)
            raise
        return self.documents

    def _join_documents(self) -> str:
        """Concatenate the content of the loaded documents, newline-separated."""
        return "\n".join(doc.page_content for doc in self.documents)

    def get_full_text(self) -> str:
        """Return the text of all documents, loading them first if none are held."""
        if not self.documents:
            self.load_documents()
        return self._join_documents()

    def process(self) -> str:
        """Reload the file and return its full text.

        Always performs exactly one load, even when the file yields no
        documents, so the source is not read (and logged) twice.
        """
        self.load_documents()
        return self._join_documents()

    def get_page_stats(self) -> list:
        """Return ``page_stats``, warning when it is still empty."""
        if not self.page_stats:
            # NOTE(review): the message points at save_chunks(), which this
            # class does not define -- confirm where page_stats gets filled.
            logging.warning("page_stats is empty; run save_chunks() first.")
        return self.page_stats