Dobin Yim committed · Commit c97d8e1 · Parent(s): 2444848

modular files

Files changed:
- calcscore.py +42 -0
- extractjson.py +14 -0
- final.py +64 -247
- prompt_templates.py +59 -0
- promptsplitembed.py +33 -0
- readfile.py +51 -0
calcscore.py
ADDED
@@ -0,0 +1,42 @@
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from promptsplitembed import create_prompt, create_qamodel
from extractjson import extract_json

def compute_cosine_similarity(reference_embeddings: dict, student_embeddings: dict) -> float:
    similarity_results = {}
    for key in reference_embeddings.keys():
        if key not in student_embeddings:
            similarity_results[key] = 0
            continue
        reference_vector = np.array(reference_embeddings[key]).reshape(1, -1)
        student_vector = np.array(student_embeddings[key]).reshape(1, -1)
        if reference_vector.shape[1] != student_vector.shape[1]:
            min_dim = min(reference_vector.shape[1], student_vector.shape[1])
            reference_vector = reference_vector[:, :min_dim]
            student_vector = student_vector[:, :min_dim]
        similarity = cosine_similarity(reference_vector, student_vector)[0][0]
        similarity_results[key] = similarity

    total_similarity = sum(similarity_results.values())
    num_questions = len(similarity_results)
    average_similarity = total_similarity / num_questions if num_questions else 0

    return average_similarity

def llm_similarity(answers, student_result, llm_score_prompt_template):
    score_prompt = llm_score_prompt_template
    qa_chat_model = create_qamodel(model="gpt-4o-mini", temperature=0)

    score_prompt_template = create_prompt(score_prompt)
    student_score_chain = score_prompt_template | qa_chat_model

    student_score = student_score_chain.invoke({"source": answers, "student": student_result})
    llm_score_tokens = student_score.usage_metadata["total_tokens"]
    student_score = dict(extract_json(student_score)[0])

    total_score = sum(student_score.values())
    num_questions = len(student_score)
    average_score = total_score / num_questions if num_questions else 0

    return average_score, llm_score_tokens
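As a quick sanity check, here is a minimal sketch of how compute_cosine_similarity behaves on hand-made inputs. It assumes the project's dependencies (numpy, scikit-learn, langchain) are installed so that calcscore imports cleanly; the question keys and embedding values are invented for illustration and are not produced by the real pipeline.

from calcscore import compute_cosine_similarity

# Hypothetical per-question embedding vectors (real ones come from generate_embeddings).
reference_embeddings = {
    "Question #1": [0.1, 0.3, 0.5],
    "Question #2": [0.2, 0.1, 0.9],
}
student_embeddings = {
    "Question #1": [0.1, 0.3, 0.4],
    # "Question #2" is missing, so it contributes 0 to the average.
}

print(round(compute_cosine_similarity(reference_embeddings, student_embeddings), 3))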
extractjson.py
ADDED
@@ -0,0 +1,14 @@
import json
import os
from langchain_core.messages import AIMessage
from typing import List, Dict, Tuple
import re

def extract_json(message: AIMessage) -> List[dict]:
    text = message.content
    pattern = r"```json(.*?)```"
    matches = re.findall(pattern, text, re.DOTALL)
    try:
        return [json.loads(match.strip()) for match in matches]
    except Exception:
        raise ValueError(f"Failed to parse: {message}")
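For context, extract_json expects the model reply to wrap its JSON in a ```json fence, which is what the prompts ask for. A minimal sketch of the round trip, with a fabricated message content:

from langchain_core.messages import AIMessage
from extractjson import extract_json

# Fabricated model reply in the fenced-JSON format the prompts request.
reply = AIMessage(content='Here you go:\n```json\n{"Question #1": "Answer #1 text"}\n```')

pairs = extract_json(reply)[0]
print(pairs["Question #1"])  # -> Answer #1 text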
final.py
CHANGED
@@ -9,8 +9,9 @@
 import logging
 import sys
 import os
-import
-import
+import asyncio
+import shutil
+from readfile import prepare_files, USER_FILES_DIR
 from typing import List, Dict, Tuple
 from dotenv import load_dotenv
 from langchain_community.document_loaders import PyMuPDFLoader
@@ -27,7 +28,11 @@ import numpy as np
 from sklearn.metrics.pairwise import cosine_similarity
 import chainlit as cl
 import asyncio
-import
+from readfile import prepare_files
+from promptsplitembed import create_prompt, split_documents, generate_embeddings, create_qamodel
+from extractjson import extract_json
+from calcscore import compute_cosine_similarity, llm_similarity
+from prompt_templates import ref_prompt, student_prompt, llm_score_prompt_template

 # Load environment variables
 load_dotenv()
@@ -37,133 +42,12 @@ openai.api_key = OPENAI_API_KEY
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-
-# Define constants
-REFERENCE_DOCUMENT_PATH = './Excel Review.pdf'
-UPLOAD_FOLDER = './uploads'
-TEMP_DIR = "./temp"
-
-# Ensure the upload folder exists
-os.makedirs(UPLOAD_FOLDER, exist_ok=True)
-os.makedirs(TEMP_DIR, exist_ok=True)
-
-def unzip_file(file_path: str, output_dir: str):
-    with zipfile.ZipFile(file_path, 'r') as zip_ref:
-        for member in zip_ref.namelist():
-            if not member.startswith('__MACOSX/'):
-                zip_ref.extract(member, output_dir)
-
-def read_pdf(file_path: str) -> List[Document]:
-    loader = PyMuPDFLoader(file_path)
-    return loader.load()
-
-def read_docx(file_path: str) -> Document:
-    doc = DocxDocument(file_path)
-    text = "\n".join([p.text for p in doc.paragraphs])
-    return Document(page_content=text, metadata={"source": file_path})
-
-def read_files_from_directory(directory: str) -> List[Document]:
-    documents = []
-    for root, _, files in os.walk(directory):
-        for file in files:
-            file_path = os.path.join(root, file)
-            if os.path.basename(file_path).startswith('~$'):
-                continue  # Skip temporary files
-            if file_path.endswith('.docx'):
-                documents.append(read_docx(file_path))
-            elif file_path.endswith('.pdf'):
-                documents.extend(read_pdf(file_path))
-    return documents
-
-def extract_json(message: AIMessage) -> List[dict]:
-    text = message.content
-    pattern = r"```json(.*?)```"
-    matches = re.findall(pattern, text, re.DOTALL)
-    try:
-        return [json.loads(match.strip()) for match in matches]
-    except Exception:
-        raise ValueError(f"Failed to parse: {message}")

-qa_chat_model = ChatOpenAI(
-    model="gpt-4o-mini",
-    temperature=0
+qa_chat_model = create_qamodel(model="gpt-4o-mini", temperature=0
 )
-
-ref_prompt = f"""
-You are given a reference documents. The document contains a mix of instructions, guides, questions, and answers.
-Your task is to go through the reference document and extract questions and answers from the document step-by-step.
-Use the keyword 'Question #' to identify the start of each question.
-Retain the following words until the 'Answer:' as the question.
-Use the keyword 'Answer:' to identify the start of each answer.
-Retain the follwing words until the 'Question:' as the answer, until the end of the document.
-Remove any white spaces such as carriage returns.
-Return the question-answer pairs as a key-value pair as Dict type.
----
-
-Reference Document Content:
-{{source}}
-
-Please extract the question-answer pairs and return them as JSON.
-"""
-
-ref_prompt_template = ChatPromptTemplate.from_template(ref_prompt)
-ref_generation_chain = ref_prompt_template | qa_chat_model
-
-student_prompt = f"""
-You are given a student assignment document. The document may contain a mix of instructions, guides, questions, and answers.
-Your task is to go through the student document and extract answers to questions from the document step-by-step.
-Use the reference document as a guide.
-Use the keyword 'Question #' to identify each question.
-Then for its associated values, search the student document for the answer.
-If you do not see any answer in the student document, return 'No answer found'.
-Do not make up any answer.
-Remove any white spaces such as carriage returns.
-Return the original question and the student answer pairs as a key-value pair as Dict type.
----
-
-Reference Content:
-{{source}}
-
-Student Content:
-{{student}}
-
-Please extract the question-answer pairs and return them as JSON.
-"""
-
-student_prompt_template = ChatPromptTemplate.from_template(student_prompt)
-student_response_chain = student_prompt_template | qa_chat_model
-
-def split_documents(documents: List[Document]) -> List[Document]:
-    text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=500,
-        chunk_overlap=100,
-        length_function=len,
-        is_separator_regex=False
-    )
-    split_docs = text_splitter.split_documents(documents)
-    total_tokens = sum(len(doc.page_content) for doc in split_docs)  # Approximate token count
-    return split_docs, total_tokens
-
-def generate_embeddings(docs: List[Document]) -> List[List[float]]:
-    embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small")
-    embeddings = embeddings_model.embed_documents([doc.page_content for doc in docs])
-    total_tokens = sum(len(doc.page_content) for doc in docs)  # Approximate token count
-    return embeddings, total_tokens
-
-def prepare_files(zip_file_name: str):
-    unzip_file(os.path.join(UPLOAD_FOLDER, zip_file_name), TEMP_DIR)
-    documents = read_files_from_directory(os.path.join(TEMP_DIR, os.path.splitext(zip_file_name)[0]))
-    reference_document = read_pdf(REFERENCE_DOCUMENT_PATH)
-    return documents, reference_document
-
-def process_student(documents, reference):
-    test_doc = documents[0]
-    student_result = student_response_chain.invoke({"source": reference.keys(), "student": test_doc})
-    student_gen_tokens = student_result.usage_metadata["total_tokens"]
-    student_result = dict(extract_json(student_result)[0])
-    return student_result, student_gen_tokens
-
-def process_reference(reference_document):
+def process_reference(reference_document, ref_prompt):
+    ref_prompt_template = create_prompt(ref_prompt)
+    ref_generation_chain = ref_prompt_template | qa_chat_model
     result = ref_generation_chain.invoke({"source": reference_document})
     ref_gen_tokens = result.usage_metadata["total_tokens"]
     reference = dict(extract_json(result)[0])
@@ -174,120 +58,54 @@ def process_reference(reference_document):
         question_number = key.split('#')[1]
         answer_key = f'Answer #{question_number}'
         answers[key] = reference[answer_key]
-
+    print("Processed reference document")
     return reference, answers, ref_gen_tokens

-def
+def process_student(documents, reference, student_prompt):
+    test_doc = documents[0]
+    student_prompt_template = create_prompt(student_prompt)
+    student_response_chain = student_prompt_template | qa_chat_model
+    student_result = student_response_chain.invoke({"source": reference.keys(), "student": test_doc})
+    student_gen_tokens = student_result.usage_metadata["total_tokens"]
+    student_result = dict(extract_json(student_result)[0])
+    print("Processed student document")
+    return student_result, student_gen_tokens
+
+def compare_docs(answers, student_result):
     split_reference_docs, ref_tokens = {}, 0
     split_student_docs, student_tokens = {}, 0
     for key, value in answers.items():
-
-        split_reference_docs[key] =
+        compare_docs, tokens = split_documents([Document(page_content=value)])
+        split_reference_docs[key] = compare_docs
         ref_tokens += tokens

     for key, value in student_result.items():
-
-        split_student_docs[key] =
+        compare_docs, tokens = split_documents([Document(page_content=value)])
+        split_student_docs[key] = compare_docs
         student_tokens += tokens

     reference_embeddings = {key: generate_embeddings(value)[0] for key, value in split_reference_docs.items()}
     student_embeddings = {key: generate_embeddings(value)[0] for key, value in split_student_docs.items()}
-
+    print("Completed comparing student and solution answers.")
     return reference_embeddings, student_embeddings, ref_tokens, student_tokens

-def compute_cosine_similarity(reference_embeddings: dict, student_embeddings: dict) -> float:
-    similarity_results = {}
-    for key in reference_embeddings.keys():
-        if key not in student_embeddings:
-            similarity_results[key] = 0
-            continue
-        reference_vector = np.array(reference_embeddings[key]).reshape(1, -1)
-        student_vector = np.array(student_embeddings[key]).reshape(1, -1)
-        if reference_vector.shape[1] != student_vector.shape[1]:
-            min_dim = min(reference_vector.shape[1], student_vector.shape[1])
-            reference_vector = reference_vector[:, :min_dim]
-            student_vector = student_vector[:, :min_dim]
-        similarity = cosine_similarity(reference_vector, student_vector)[0][0]
-        similarity_results[key] = similarity
-
-    total_similarity = sum(similarity_results.values())
-    num_questions = len(similarity_results)
-    average_similarity = total_similarity / num_questions if num_questions else 0
-
-    return average_similarity
-
-
-def llm_similarity(answers, student_result):
-    score_prompt = f"""
-You are given two dictionaries representing instructor solution and student answers.
-Your task is to go through each question to grade the correctness of student answer.
-Use the keyword 'Question #' to identify each question.
-Then for its associated values, compare student answer against the instructor answer.
-If the instructor answer has numerical values, check to make sure the student answer has the same number,
-whether it is expressed in numbers or text.
-If you do not see any answer in the student answer, assign score 0 for that answer.
-For student answer that is similar to instructor, assign a full score of 1.
-If the student answer is similar enough, assign a partial score of 0.5.
-Otherwise, assign a score of 0.
-Return the original question and the student score pairs as a key-value pair as Dict type.
----
-
-Reference Content:
-{{source}}
-
-Student Content:
-{{student}}
-
-Please extract the question-answer pairs and return them as JSON.
-"""
-
-    score_prompt_template = ChatPromptTemplate.from_template(score_prompt)
-    student_score_chain = score_prompt_template | qa_chat_model
-
-    student_score = student_score_chain.invoke({"source": answers, "student": student_result})
-    llm_score_tokens = student_score.usage_metadata["total_tokens"]
-    student_score = dict(extract_json(student_score)[0])
-
-    total_score = sum(student_score.values())
-    num_questions = len(student_score)
-    average_score = total_score / num_questions if num_questions else 0
-
-    return average_score, llm_score_tokens
-
-def process_data(zip_file_name: str) -> Tuple[float, float, int, int, int]:
+def process_data(zip_file_name: str, prompt_template) -> Tuple[float, float, int, int, int]:
     documents, reference_document = prepare_files(zip_file_name)
-    reference, answers, ref_gen_tokens = process_reference(reference_document)
-    student_result, student_gen_tokens = process_student(documents, reference)
-    reference_embeddings, student_embeddings, ref_tokens, student_tokens =
+    reference, answers, ref_gen_tokens = process_reference(reference_document, ref_prompt)
+    student_result, student_gen_tokens = process_student(documents, reference, student_prompt)
+    reference_embeddings, student_embeddings, ref_tokens, student_tokens = compare_docs(answers, student_result)
    student_total_tokens = student_gen_tokens + student_tokens
    ref_total_tokens = ref_gen_tokens + ref_tokens

    average_similarity = compute_cosine_similarity(reference_embeddings, student_embeddings)
-    average_score, llm_score_tokens = llm_similarity(answers, student_result)
+    average_score, llm_score_tokens = llm_similarity(answers, student_result, llm_score_prompt_template)
    llm_total_tokens = ref_gen_tokens + student_gen_tokens + llm_score_tokens

    return average_similarity, average_score, ref_total_tokens, student_total_tokens, llm_total_tokens

-async def process_grading():
-    global uploaded_file_name
-    if uploaded_file_name:
-        try:
-            # Process the uploaded ZIP file
-            average_similarity, average_score, ref_total_tokens, student_total_tokens, llm_total_tokens = process_data(uploaded_file_name)
-
-            # Send results
-            await cl.Message(content=f"Processing complete. Results:\n"
-                             f"Average Similarity: {average_similarity:.2f}\n"
-                             f"Average Score: {average_score:.2f}\n"
-                             f"Reference Total Tokens: {ref_total_tokens}\n"
-                             f"Student Total Tokens: {student_total_tokens}\n"
-                             f"LLM Total Tokens: {llm_total_tokens}").send()
-        except Exception as e:
-            await cl.Message(f"An error occurred while processing the zip file: {str(e)}").send()
-    else:
-        await cl.Message("No file has been uploaded yet. Please upload a ZIP file first.").send()

 user_wants_to_continue = False
+uploaded_file_name = None

 @cl.on_chat_start
 async def start():
@@ -301,11 +119,11 @@ async def start():
     ).send()

     zip_file = files[0]  # Assuming only one file is uploaded
-    file_path = os.path.join(
+    file_path = os.path.join(USER_FILES_DIR, zip_file.name)
     uploaded_file_name = zip_file.name

-    # Move the uploaded file to the
-
+    # Move the uploaded file to the user files directory
+    shutil.move(zip_file.path, file_path)

     # Let the user know that the system is ready
     await cl.Message(content=f"`{zip_file.name}` uploaded successfully!").send()
@@ -313,6 +131,25 @@ async def start():
     # Ask if the user wants to proceed with grading
     await cl.Message(content="Do you want to proceed with the grading? (yes/no)").send()

+async def process_grading():
+    global uploaded_file_name
+    if uploaded_file_name:
+        try:
+            # Process the uploaded ZIP file
+            average_similarity, average_score, ref_total_tokens, student_total_tokens, llm_total_tokens = process_data(uploaded_file_name, llm_score_prompt_template)
+
+            # Send results
+            await cl.Message(content=f"Processing complete. Results:\n"
+                             f"Average Similarity: {average_similarity:.2f}\n"
+                             f"Average Score: {average_score:.2f}\n"
+                             f"Reference Total Tokens: {ref_total_tokens}\n"
+                             f"Student Total Tokens: {student_total_tokens}\n"
+                             f"LLM Total Tokens: {llm_total_tokens}").send()
+        except Exception as e:
+            await cl.Message(content=f"An error occurred while processing the zip file: {str(e)}").send()
+    else:
+        await cl.Message(content="No file has been uploaded yet. Please upload a ZIP file first.").send()
+
 @cl.on_message
 async def on_message(message: cl.Message):
     global user_wants_to_continue, uploaded_file_name
@@ -329,31 +166,11 @@ async def on_message(message: cl.Message):
         user_wants_to_continue = True
         await cl.Message(content="Do you want to continue? (yes/no)").send()

-
-        if message.content.lower() == 'yes':
-            user_wants_to_continue = False
-            uploaded_file_name = None
-            await cl.Message(content="Restarting the app...").send()
-            await asyncio.sleep(1)
-            python = sys.executable
-            os.execl(python, python, *sys.argv)
-
-        elif message.content.lower() == 'no':
-            user_wants_to_continue = False
-            uploaded_file_name = None
-            await cl.Message(content="Okay, thank you for using the grading app. Restarting...").send()
-            await asyncio.sleep(1)
-            python = sys.executable
-            os.execl(python, python, *sys.argv)
-
-        else:
-            await cl.Message(content="Invalid response. Please type 'yes' or 'no'.").send()
-
-    elif message.content.lower() == 'no':
-        await cl.Message(content="Okay, thank you for using the grading app. Restarting...").send()
-        await asyncio.sleep(1)
-        python = sys.executable
-        os.execl(python, python, *sys.argv)
+    # ... rest of the function ...

-
-
+if __name__ == "__main__":
+    # Ensure the user files directory exists
+    os.makedirs(USER_FILES_DIR, exist_ok=True)
+
+    # Your Chainlit app setup and run code here
+    cl.run()
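For orientation, the same grading flow can be driven without the Chainlit UI. The sketch below is not part of the commit; it assumes the project dependencies and OPENAI_API_KEY are configured, that './Excel Review.pdf' is present, and that a ZIP called 'submissions.zip' (a made-up name) has already been copied into USER_FILES_DIR.

from final import process_data
from prompt_templates import llm_score_prompt_template

# process_data unzips the submission, extracts Q/A pairs with the LLM,
# embeds both sides, and returns similarity, score, and token counts.
similarity, score, ref_tokens, student_tokens, llm_tokens = process_data(
    "submissions.zip", llm_score_prompt_template
)
print(f"cosine similarity: {similarity:.2f}, LLM score: {score:.2f}")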
prompt_templates.py
ADDED
@@ -0,0 +1,59 @@
ref_prompt = f"""
You are given a reference document. The document contains a mix of instructions, guides, questions, and answers.
Your task is to go through the reference document and extract questions and answers from the document step-by-step.
Use the keyword 'Question #' to identify the start of each question.
Retain the following words until the 'Answer:' as the question.
Use the keyword 'Answer:' to identify the start of each answer.
Retain the following words until the 'Question:' as the answer, until the end of the document.
Remove any white spaces such as carriage returns.
Return the question-answer pairs as a key-value pair as Dict type.
---

Reference Document Content:
{{source}}

Please extract the question-answer pairs and return them as JSON.
"""

student_prompt = f"""
You are given a student assignment document. The document may contain a mix of instructions, guides, questions, and answers.
Your task is to go through the student document and extract answers to questions from the document step-by-step.
Use the reference document as a guide.
Use the keyword 'Question #' to identify each question.
Then for its associated values, search the student document for the answer.
If you do not see any answer in the student document, return 'No answer found'.
Do not make up any answer.
Remove any white spaces such as carriage returns.
Return the original question and the student answer pairs as a key-value pair as Dict type.
---

Reference Content:
{{source}}

Student Content:
{{student}}

Please extract the question-answer pairs and return them as JSON.
"""
llm_score_prompt_template = f"""
You are given two dictionaries representing instructor solution and student answers.
Your task is to go through each question to grade the correctness of student answer.
Use the keyword 'Question #' to identify each question.
Then for its associated values, compare student answer against the instructor answer.
If the instructor answer has numerical values, check to make sure the student answer has the same number,
whether it is expressed in numbers or text.
If you do not see any answer in the student answer, assign score 0 for that answer.
For student answer that is similar to instructor, assign a full score of 1.
If the student answer is similar enough, assign a partial score of 0.5.
Otherwise, assign a score of 0.
Return the original question and the student score pairs as a key-value pair as Dict type.
---

Reference Content:
{{source}}

Student Content:
{{student}}

Please extract the question-score pairs and return them as JSON.
"""
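Note that these templates are plain f-strings, so the doubled braces collapse to single braces ({{source}} becomes {source}), which is exactly what ChatPromptTemplate later picks up as an input variable. A small sketch of that hand-off, assuming langchain is installed:

from langchain_core.prompts import ChatPromptTemplate
from prompt_templates import ref_prompt

# The f-string turned {{source}} into {source}, so it becomes a template variable.
template = ChatPromptTemplate.from_template(ref_prompt)
print(template.input_variables)  # expected: ['source']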
promptsplitembed.py
ADDED
@@ -0,0 +1,33 @@
from typing import List, Dict, Tuple
from langchain_core.prompts import ChatPromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

def create_prompt(prompt):
    prompt_template = ChatPromptTemplate.from_template(prompt)
    return prompt_template

def split_documents(documents: List[Document]) -> List[Document]:
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False
    )
    split_docs = text_splitter.split_documents(documents)
    total_tokens = sum(len(doc.page_content) for doc in split_docs)  # Approximate token count
    return split_docs, total_tokens

def generate_embeddings(docs: List[Document]) -> List[List[float]]:
    embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small")
    embeddings = embeddings_model.embed_documents([doc.page_content for doc in docs])
    total_tokens = sum(len(doc.page_content) for doc in docs)  # Approximate token count
    return embeddings, total_tokens

def create_qamodel(model="gpt-4o-mini", temperature=0):
    # Pass the arguments through rather than hard-coding the defaults again
    qamodel = ChatOpenAI(
        model=model,
        temperature=temperature
    )
    return qamodel
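A minimal sketch of the split-then-embed path, assuming the packages are installed and OPENAI_API_KEY is set for the embedding call; the document content is a toy string, and the "token" counts returned here are really character counts used as a rough proxy:

from langchain.schema import Document
from promptsplitembed import split_documents, generate_embeddings

# Toy content, long enough to produce several 500-character chunks.
doc = Document(page_content="Question #1: What does VLOOKUP return? Answer: ... " * 30)
chunks, approx_size = split_documents([doc])   # 500-char chunks, 100-char overlap
print(len(chunks), approx_size)

vectors, _ = generate_embeddings(chunks)       # one vector per chunk (OpenAI API call)
print(len(vectors), len(vectors[0]))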
readfile.py
ADDED
@@ -0,0 +1,51 @@
import zipfile
from typing import List
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.schema import Document
from docx import Document as DocxDocument
import os

# Define constants
REFERENCE_DOCUMENT_PATH = './Excel Review.pdf'
USER_FILES_DIR = os.getenv('CHAINLIT_USER_FILES_DIR', '/tmp/chainlit_user_files')

# Ensure the user files directory exists
os.makedirs(USER_FILES_DIR, exist_ok=True)

def unzip_file(file_path: str, output_dir: str):
    with zipfile.ZipFile(file_path, 'r') as zip_ref:
        for member in zip_ref.namelist():
            if not member.startswith('__MACOSX/'):
                zip_ref.extract(member, output_dir)

def read_pdf(file_path: str) -> List[Document]:
    loader = PyMuPDFLoader(file_path)
    return loader.load()

def read_docx(file_path: str) -> Document:
    doc = DocxDocument(file_path)
    text = "\n".join([p.text for p in doc.paragraphs])
    return Document(page_content=text, metadata={"source": file_path})

def read_files_from_directory(directory: str) -> List[Document]:
    documents = []
    for root, _, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(root, file)
            if os.path.basename(file_path).startswith('~$'):
                continue  # Skip temporary files
            if file_path.endswith('.docx'):
                documents.append(read_docx(file_path))
            elif file_path.endswith('.pdf'):
                documents.extend(read_pdf(file_path))
    return documents

# Read file from user
def prepare_files(zip_file_name: str):
    zip_file_path = os.path.join(USER_FILES_DIR, zip_file_name)
    unzip_dir = os.path.join(USER_FILES_DIR, os.path.splitext(zip_file_name)[0])
    unzip_file(zip_file_path, unzip_dir)
    documents = read_files_from_directory(unzip_dir)
    reference_document = read_pdf(REFERENCE_DOCUMENT_PATH)
    print("Your file", zip_file_name, "has been successfully unzipped")
    return documents, reference_document
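A short usage sketch, assuming a ZIP named 'submissions.zip' (hypothetical) has already been placed in USER_FILES_DIR and the reference PDF exists at REFERENCE_DOCUMENT_PATH:

from readfile import USER_FILES_DIR, prepare_files

# prepare_files takes only the file name; it resolves the path inside USER_FILES_DIR,
# unzips into a sibling folder, and loads every .docx/.pdf found there.
documents, reference_document = prepare_files("submissions.zip")
print(len(documents), "student documents;", len(reference_document), "reference pages")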