Spaces:
Sleeping
Sleeping
| # document_processing.py | |
| from PyPDF2 import PdfReader | |
| from docx import Document as DocxDocument | |
| import os | |
def extract_text_from_pdf(pdf_path):
    """Return the extracted text of every page of a PDF as one string.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``PdfReader``.

    Returns:
        The text of each page, in page order, with a newline appended after
        every page. An empty string when the reader reports no pages.
    """
    reader = PdfReader(pdf_path)
    # Defensive ``or ''``: if a page yields no text object (None), treat it as
    # empty instead of failing on ``None + str``.
    # NOTE(review): confirm whether the installed PyPDF2 can return None here.
    # ``join`` assembles the result in one pass rather than re-copying the
    # accumulated string on every ``+=``.
    return ''.join((page.extract_text() or '') + '\n' for page in reader.pages)
def extract_text_from_docx(docx_path):
    """Return the paragraph texts of a .docx document joined by newlines.

    Args:
        docx_path: Location of the document; passed unchanged to
            ``DocxDocument``.

    Returns:
        A string containing the ``text`` of each entry in the document's
        ``paragraphs`` collection, separated by a newline.
    """
    paragraph_texts = []
    for para in DocxDocument(docx_path).paragraphs:
        paragraph_texts.append(para.text)
    return '\n'.join(paragraph_texts)
def load_documents_from_directory(text_dir):
    """Load the text of every PDF and DOCX file directly inside a directory.

    Only regular files whose names end in ``.pdf`` or ``.docx`` (compared
    case-insensitively) are read; everything else, including
    sub-directories, is skipped. Entries are processed in sorted name order
    so the result is reproducible across runs and platforms.

    Args:
        text_dir: Path of the directory to scan (not recursive).

    Returns:
        A list of dicts, one per loaded file, each of the form
        ``{"text": <extracted text>, "metadata": {"filename": <entry name>}}``.
        Empty when the directory holds no matching files.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``text_dir`` cannot be
            listed. Errors raised by the extractors are not caught here.
    """
    documents = []
    # os.listdir() returns entries in arbitrary order; sort for determinism.
    for file in sorted(os.listdir(text_dir)):
        file_path = os.path.join(text_dir, file)
        # A directory named e.g. "scans.pdf" must not be handed to a parser.
        if not os.path.isfile(file_path):
            continue
        # Compare on a lower-cased name so "REPORT.PDF" is not silently lost.
        lowered_name = file.lower()
        if lowered_name.endswith('.pdf'):
            text = extract_text_from_pdf(file_path)
        elif lowered_name.endswith('.docx'):
            text = extract_text_from_docx(file_path)
        else:
            continue
        documents.append({"text": text, "metadata": {"filename": file}})
    return documents