ChatBot-10 / Chatbot /document_processing.py
hira880's picture
Upload 23 files
faa2f0c verified
raw
history blame contribute delete
908 Bytes
# document_processing.py
from PyPDF2 import PdfReader
from docx import Document as DocxDocument
import os
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, one page after another.

    Args:
        pdf_path: Location of the PDF, passed straight to ``PdfReader``.

    Returns:
        A single string in which each page's extracted text is followed by
        a newline. A PDF with no pages yields an empty string.

    Raises:
        Whatever ``PdfReader`` or ``extract_text`` raise; nothing is caught
        here.
    """
    reader = PdfReader(pdf_path)
    # NOTE(review): the ``or ''`` is a cheap guard so that a page whose
    # extraction yields None (e.g. a page without a text layer, depending on
    # the installed library version -- confirm) contributes an empty line
    # instead of raising TypeError on concatenation.
    # Joining once avoids rebuilding the accumulated string for each page.
    return ''.join((page.extract_text() or '') + '\n' for page in reader.pages)
def extract_text_from_docx(docx_path):
    """Return the text of a DOCX file with one paragraph per line.

    Opens ``docx_path`` with ``DocxDocument`` and joins the ``text`` of each
    entry in the document's ``paragraphs`` with newline characters.
    """
    document = DocxDocument(docx_path)
    paragraph_texts = (para.text for para in document.paragraphs)
    return '\n'.join(paragraph_texts)
def load_documents_from_directory(text_dir):
    """Load every PDF and DOCX file found directly inside ``text_dir``.

    The scan is not recursive. Entries are visited in sorted name order so
    the result is deterministic (``os.listdir`` returns names in arbitrary
    order). Extensions are matched case-insensitively, and entries that are
    not regular files (for example a sub-directory named ``old.pdf``) are
    skipped instead of being handed to an extractor.

    Args:
        text_dir: Path of the directory to scan.

    Returns:
        A list of dicts shaped ``{"text": <extracted text>,
        "metadata": {"filename": <entry name>}}``, one per loaded file.
        ``filename`` is the bare directory entry name, not the full path.

    Raises:
        Errors from ``os.listdir`` (e.g. a missing directory) and from the
        extractor functions propagate to the caller; nothing is caught here.
    """
    documents = []
    for file in sorted(os.listdir(text_dir)):
        file_path = os.path.join(text_dir, file)
        # Only regular files can be parsed; skip directories and the like.
        if not os.path.isfile(file_path):
            continue
        # Compare on a lowered name so ".PDF" / ".Docx" are recognised too.
        lowered_name = file.lower()
        if lowered_name.endswith('.pdf'):
            text = extract_text_from_pdf(file_path)
        elif lowered_name.endswith('.docx'):
            text = extract_text_from_docx(file_path)
        else:
            continue
        documents.append({"text": text, "metadata": {"filename": file}})
    return documents