Spaces:

hira880
/

ChatBot-10

Sleeping

ChatBot-10 / Chatbot /document_processing.py

Upload 23 files

faa2f0c verified almost 2 years ago

908 Bytes

	# document_processing.py
	from PyPDF2 import PdfReader
	from docx import Document as DocxDocument
	import os

	def extract_text_from_pdf(pdf_path):
	    """Return the text of every page of a PDF as one string.

	    The file at ``pdf_path`` is opened with ``PdfReader`` and the result of
	    ``extract_text()`` for each page is concatenated in page order, with a
	    newline appended after every page (including the last one).

	    Args:
	        pdf_path: Location of the PDF, passed unchanged to ``PdfReader``.

	    Returns:
	        The combined page text; an empty string when the PDF has no pages.
	    """
	    reader = PdfReader(pdf_path)
	    # Build the result with a single join instead of repeated ``+=`` so the
	    # cost stays linear in the total amount of extracted text.
	    return ''.join(page.extract_text() + '\n' for page in reader.pages)

	def extract_text_from_docx(docx_path):
	    """Return the paragraph text of a DOCX file, one paragraph per line.

	    The document at ``docx_path`` is opened with ``DocxDocument`` and the
	    ``text`` of each entry in ``paragraphs`` is joined with newlines.

	    Args:
	        docx_path: Location of the DOCX file, passed unchanged to
	            ``DocxDocument``.

	    Returns:
	        The newline-separated paragraph text (no trailing newline).
	    """
	    document = DocxDocument(docx_path)
	    paragraph_texts = (para.text for para in document.paragraphs)
	    return '\n'.join(paragraph_texts)

	def load_documents_from_directory(text_dir):
	    """Extract text from the PDF and DOCX files directly inside ``text_dir``.

	    Only the immediate entries of the directory are examined (no recursion).
	    Regular files whose name ends in ``.pdf`` or ``.docx`` (compared without
	    regard to case) are read with ``extract_text_from_pdf`` or
	    ``extract_text_from_docx``; every other entry is ignored.

	    Args:
	        text_dir: Directory to scan.

	    Returns:
	        A list with one dict per loaded file, ordered by file name, shaped
	        as ``{"text": <extracted text>, "metadata": {"filename": <name>}}``
	        where ``<name>`` is the entry name exactly as it appears on disk.
	    """
	    documents = []
	    # os.listdir() returns entries in arbitrary order; sort for a stable result.
	    for file in sorted(os.listdir(text_dir)):
	        file_path = os.path.join(text_dir, file)
	        # A sub-directory that happens to be named "*.pdf" / "*.docx" is not
	        # a document and must not be handed to the extractors.
	        if not os.path.isfile(file_path):
	            continue
	        # Compare on the lower-cased name so "REPORT.PDF" is picked up too.
	        lowered_name = file.lower()
	        if lowered_name.endswith('.pdf'):
	            text = extract_text_from_pdf(file_path)
	        elif lowered_name.endswith('.docx'):
	            text = extract_text_from_docx(file_path)
	        else:
	            continue
	        documents.append({"text": text, "metadata": {"filename": file}})
	    return documents