# aie3-autograder / readfile.py
# Dobin Yim
# modular files
# c97d8e1
import zipfile
from typing import List
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.schema import Document
from docx import Document as DocxDocument
import os
# Define constants
REFERENCE_DOCUMENT_PATH = './Excel Review.pdf'
USER_FILES_DIR = os.getenv('CHAINLIT_USER_FILES_DIR', '/tmp/chainlit_user_files')
# Ensure the user files directory exists
os.makedirs(USER_FILES_DIR, exist_ok=True)
def unzip_file(file_path: str, output_dir: str):
with zipfile.ZipFile(file_path, 'r') as zip_ref:
for member in zip_ref.namelist():
if not member.startswith('__MACOSX/'):
zip_ref.extract(member, output_dir)
def read_pdf(file_path: str) -> List[Document]:
loader = PyMuPDFLoader(file_path)
return loader.load()
def read_docx(file_path: str) -> Document:
doc = DocxDocument(file_path)
text = "\n".join([p.text for p in doc.paragraphs])
return Document(page_content=text, metadata={"source": file_path})
def read_files_from_directory(directory: str) -> List[Document]:
documents = []
for root, _, files in os.walk(directory):
for file in files:
file_path = os.path.join(root, file)
if os.path.basename(file_path).startswith('~$'):
continue # Skip temporary files
if file_path.endswith('.docx'):
documents.append(read_docx(file_path))
elif file_path.endswith('.pdf'):
documents.extend(read_pdf(file_path))
return documents
# Unzip the user's uploaded archive and load its documents plus the reference PDF
def prepare_files(zip_file_name: str):
zip_file_path = os.path.join(USER_FILES_DIR, zip_file_name)
unzip_dir = os.path.join(USER_FILES_DIR, os.path.splitext(zip_file_name)[0])
unzip_file(zip_file_path, unzip_dir)
documents = read_files_from_directory(unzip_dir)
reference_document = read_pdf(REFERENCE_DOCUMENT_PATH)
print("Your file", zip_file_name, "has been successfully unzipped")
return documents, reference_document