Spaces:
Paused
Paused
import zipfile | |
from typing import List | |
from langchain_community.document_loaders import PyMuPDFLoader | |
from langchain.schema import Document | |
from docx import Document as DocxDocument | |
import os | |
# --- Module configuration ---------------------------------------------------
# Reference PDF that uploads are compared against (relative path).
REFERENCE_DOCUMENT_PATH = './Excel Review.pdf'
# Directory holding user uploads; the CHAINLIT_USER_FILES_DIR environment
# variable overrides the /tmp default when it is set.
USER_FILES_DIR = os.environ.get('CHAINLIT_USER_FILES_DIR', '/tmp/chainlit_user_files')

# Create the upload directory at import time; an existing directory is fine.
os.makedirs(USER_FILES_DIR, exist_ok=True)
def unzip_file(file_path: str, output_dir: str):
    """Extract a zip archive into *output_dir*, leaving out ``__MACOSX/`` entries.

    Args:
        file_path: Path of the zip archive to open for reading.
        output_dir: Directory the archive members are extracted into.
    """
    with zipfile.ZipFile(file_path, 'r') as archive:
        # Keep every member except those stored under the top-level
        # "__MACOSX/" folder.
        wanted = [
            name
            for name in archive.namelist()
            if not name.startswith('__MACOSX/')
        ]
        archive.extractall(output_dir, members=wanted)
def read_pdf(file_path: str) -> List[Document]:
    """Load the PDF at *file_path* with ``PyMuPDFLoader``.

    Args:
        file_path: Path of the PDF file handed to the loader.

    Returns:
        The documents produced by the loader's ``load()`` call.
    """
    return PyMuPDFLoader(file_path).load()
def read_docx(file_path: str) -> Document:
    """Read a ``.docx`` file into a single ``Document``.

    Args:
        file_path: Path of the Word file to open.

    Returns:
        A ``Document`` whose ``page_content`` is the text of every entry in
        ``doc.paragraphs`` joined by newlines, and whose metadata records
        *file_path* under the ``"source"`` key.
    """
    # NOTE(review): only ``doc.paragraphs`` is read; text stored elsewhere in
    # the file (e.g. tables) is presumably not included -- confirm if needed.
    word_file = DocxDocument(file_path)
    body = "\n".join(paragraph.text for paragraph in word_file.paragraphs)
    source_info = {"source": file_path}
    return Document(page_content=body, metadata=source_info)
def read_files_from_directory(directory: str) -> List[Document]:
    """Recursively load every ``.docx`` and ``.pdf`` file under *directory*.

    Extensions are matched case-insensitively, so ``REPORT.PDF`` and
    ``Notes.DOCX`` are loaded as well. Directories and files are visited in
    sorted order so the result order does not depend on the filesystem.

    Args:
        directory: Root directory to walk.

    Returns:
        The loaded documents: one entry per ``.docx`` file (via
        ``read_docx``) and every entry returned by ``read_pdf`` for each
        ``.pdf`` file. Files with other extensions are ignored.
    """
    documents: List[Document] = []
    for root, dirs, files in os.walk(directory):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirs.sort()
        for file_name in sorted(files):
            # Names starting with "~$" are temporary files, not documents.
            if file_name.startswith('~$'):
                continue
            file_path = os.path.join(root, file_name)
            lowered = file_name.lower()
            if lowered.endswith('.docx'):
                documents.append(read_docx(file_path))
            elif lowered.endswith('.pdf'):
                documents.extend(read_pdf(file_path))
    return documents
# Load the user's uploaded archive together with the reference document
def prepare_files(zip_file_name: str):
    """Unzip an uploaded archive and load it alongside the reference PDF.

    The archive is looked up inside ``USER_FILES_DIR`` and extracted into a
    sibling directory named after the archive without its extension.

    Args:
        zip_file_name: File name of the archive, relative to ``USER_FILES_DIR``.

    Returns:
        A ``(documents, reference_document)`` tuple: the documents read from
        the extracted directory, and the result of reading
        ``REFERENCE_DOCUMENT_PATH`` as a PDF.
    """
    archive_path = os.path.join(USER_FILES_DIR, zip_file_name)
    archive_stem, _extension = os.path.splitext(zip_file_name)
    extract_dir = os.path.join(USER_FILES_DIR, archive_stem)

    unzip_file(archive_path, extract_dir)
    uploaded_docs = read_files_from_directory(extract_dir)
    reference_docs = read_pdf(REFERENCE_DOCUMENT_PATH)

    print("Your file", zip_file_name, "has been successfully unzipped")
    return uploaded_docs, reference_docs