Spaces:

saad810
/

prototype2

Paused

App Files Files Community

prototype2 / lib /process_docs.py

saad810

Upload 15 files

49202c5 verified over 1 year ago

Raw

History Blame Contribute Delete

2.2 kB

	import os
	import logging
	from langchain.docstore.document import Document
	from langchain_community.document_loaders import PyMuPDFLoader


	class ProcessDocs:
	    """Load one text or PDF file and expose its content as plain text.

	    ``documents`` starts empty and is filled by ``load_documents()``;
	    ``get_full_text()`` and ``process()`` trigger that load themselves.
	    """

	    def __init__(self, file_path: str, language: str = 'en', file_type: str = 'txt'):
	        """Record the source file and how it should be read.

	        Args:
	            file_path: Path of the file to load.
	            language: Language tag stored on the instance; no method of
	                this class reads it.
	            file_type: ``"txt"`` or ``"pdf"``. Case, surrounding whitespace
	                and a leading dot are ignored, so ``".PDF"`` means ``"pdf"``.
	        """
	        self.file_path = file_path
	        self.language = language
	        # A raw extension such as ".pdf" (e.g. from os.path.splitext) would
	        # otherwise fail the exact comparisons in load_documents().
	        self.file_type = file_type.strip().lower().lstrip(".")
	        self.documents = []
	        self.file_name = os.path.basename(self.file_path)
	        # Nothing in this class appends to page_stats; see get_page_stats().
	        self.page_stats = []

	    def load_documents(self) -> list:
	        """Read the file into ``self.documents`` and return that list.

	        A PDF is delegated to ``PyMuPDFLoader``; a text file is read as
	        UTF-8 and wrapped in a single ``Document`` whose metadata carries
	        ``page`` 1 and the file path.

	        Returns:
	            The list stored in ``self.documents``.

	        Raises:
	            ValueError: If ``file_type`` is neither ``"pdf"`` nor ``"txt"``.
	            Exception: Any other failure while opening or parsing the file
	                is logged and re-raised unchanged.
	        """
	        try:
	            if self.file_type == "pdf":
	                self.documents = PyMuPDFLoader(self.file_path).load()
	            elif self.file_type == "txt":
	                with open(self.file_path, "r", encoding="utf-8") as f:
	                    content = f.read()
	                self.documents = [
	                    Document(
	                        page_content=content,
	                        metadata={"page": 1, "file_path": self.file_path},
	                    )
	                ]
	            else:
	                raise ValueError(f"Unsupported file type: {self.file_type}")
	            logging.info(
	                "Loaded %d documents from %s.", len(self.documents), self.file_name
	            )
	        except Exception as e:
	            # Log for the operator, then let the caller decide how to react.
	            logging.error("Error loading document: %s", e)
	            raise
	        return self.documents

	    def get_full_text(self) -> str:
	        """Return the content of every loaded document joined by newlines.

	        Loads the file first when ``self.documents`` is still empty, so any
	        exception from ``load_documents()`` can surface here too.
	        """
	        if not self.documents:
	            self.load_documents()
	        return "\n".join(doc.page_content for doc in self.documents)

	    def process(self) -> str:
	        """Reload the file from disk and return its full text."""
	        # Always reload, even if documents are already cached.
	        self.load_documents()
	        return self.get_full_text()

	    def get_page_stats(self) -> list:
	        """Return ``self.page_stats``, warning when it is empty.

	        NOTE(review): the warning refers to ``save_chunks()``, which is not
	        defined in this class, and no method here fills ``page_stats`` --
	        confirm where the statistics are meant to come from.
	        """
	        if not self.page_stats:
	            logging.warning("page_stats is empty; run save_chunks() first.")
	        return self.page_stats