"""Unpack a user-uploaded zip archive and load the documents it contains.

The entry point is :func:`prepare_files`; the other functions are the
individual steps it is built from (extract, walk, parse PDF / DOCX).
"""

import os
import zipfile
from typing import List, Tuple

from docx import Document as DocxDocument
from langchain.schema import Document
from langchain_community.document_loaders import PyMuPDFLoader

# Define constants
REFERENCE_DOCUMENT_PATH = './Excel Review.pdf'
USER_FILES_DIR = os.getenv('CHAINLIT_USER_FILES_DIR', '/tmp/chainlit_user_files')

# Ensure the user files directory exists
os.makedirs(USER_FILES_DIR, exist_ok=True)


def unzip_file(file_path: str, output_dir: str) -> None:
    """Extract the archive at *file_path* into *output_dir*.

    Members whose name starts with ``__MACOSX/`` are not extracted.

    Args:
        file_path: Path to the zip archive to read.
        output_dir: Directory the members are extracted into.

    Raises:
        FileNotFoundError: If *file_path* does not exist.
        zipfile.BadZipFile: If *file_path* is not a valid zip archive.
    """
    with zipfile.ZipFile(file_path, 'r') as zip_ref:
        members = [
            member
            for member in zip_ref.namelist()
            if not member.startswith('__MACOSX/')
        ]
        # An explicit (possibly empty) list is passed so that only the
        # filtered members are extracted.
        zip_ref.extractall(output_dir, members=members)


def read_pdf(file_path: str) -> List[Document]:
    """Load the PDF at *file_path* with ``PyMuPDFLoader``.

    Args:
        file_path: Path to the PDF file.

    Returns:
        The list of ``Document`` objects produced by the loader.
    """
    loader = PyMuPDFLoader(file_path)
    return loader.load()


def read_docx(file_path: str) -> Document:
    """Load the ``.docx`` file at *file_path* as a single ``Document``.

    Only ``doc.paragraphs`` is read; the paragraph texts are joined with
    newlines and *file_path* is stored under the ``"source"`` metadata key.

    Args:
        file_path: Path to the ``.docx`` file.

    Returns:
        One ``Document`` holding the joined paragraph text.
    """
    doc = DocxDocument(file_path)
    text = "\n".join(p.text for p in doc.paragraphs)
    return Document(page_content=text, metadata={"source": file_path})


def read_files_from_directory(directory: str) -> List[Document]:
    """Recursively load every ``.docx`` and ``.pdf`` file under *directory*.

    File names starting with ``~$`` are skipped, and files with any other
    extension are ignored. Extensions are matched case-insensitively, and
    directories and files are visited in sorted order so the result order is
    reproducible.

    Args:
        directory: Root directory to walk.

    Returns:
        The loaded documents: one per ``.docx`` file, and whatever
        :func:`read_pdf` returns for each ``.pdf`` file.
    """
    documents: List[Document] = []
    for root, dirs, files in os.walk(directory):
        # Sorting ``dirs`` in place controls the order os.walk descends in.
        dirs.sort()
        for file_name in sorted(files):
            if file_name.startswith('~$'):
                continue  # Skip temporary files
            file_path = os.path.join(root, file_name)
            lowered = file_name.lower()
            if lowered.endswith('.docx'):
                documents.append(read_docx(file_path))
            elif lowered.endswith('.pdf'):
                documents.extend(read_pdf(file_path))
    return documents


# Read file from user
def prepare_files(zip_file_name: str) -> Tuple[List[Document], List[Document]]:
    """Unzip an uploaded archive and load it alongside the reference document.

    The archive is looked up inside ``USER_FILES_DIR`` and extracted into a
    sibling directory named after the archive without its extension.

    Args:
        zip_file_name: File name of the archive, relative to
            ``USER_FILES_DIR``.

    Returns:
        A ``(documents, reference_document)`` tuple: the documents loaded
        from the extracted archive, and the documents loaded from
        ``REFERENCE_DOCUMENT_PATH``.
    """
    zip_file_path = os.path.join(USER_FILES_DIR, zip_file_name)
    unzip_dir = os.path.join(USER_FILES_DIR, os.path.splitext(zip_file_name)[0])

    unzip_file(zip_file_path, unzip_dir)

    documents = read_files_from_directory(unzip_dir)
    reference_document = read_pdf(REFERENCE_DOCUMENT_PATH)

    print("Your file", zip_file_name, "has been successfully unzipped")
    return documents, reference_document