Dobin Yim committed
Commit c97d8e1 · 1 parent: 2444848

modular files

Files changed (6)
  1. calcscore.py +42 -0
  2. extractjson.py +14 -0
  3. final.py +64 -247
  4. prompt_templates.py +59 -0
  5. promptsplitembed.py +33 -0
  6. readfile.py +51 -0
calcscore.py ADDED
@@ -0,0 +1,42 @@
+ from sklearn.metrics.pairwise import cosine_similarity
+ import numpy as np
+ from promptsplitembed import create_prompt, create_qamodel
+ from extractjson import extract_json
+
+ def compute_cosine_similarity(reference_embeddings: dict, student_embeddings: dict) -> float:
+     similarity_results = {}
+     for key in reference_embeddings.keys():
+         if key not in student_embeddings:
+             similarity_results[key] = 0
+             continue
+         reference_vector = np.array(reference_embeddings[key]).reshape(1, -1)
+         student_vector = np.array(student_embeddings[key]).reshape(1, -1)
+         if reference_vector.shape[1] != student_vector.shape[1]:
+             min_dim = min(reference_vector.shape[1], student_vector.shape[1])
+             reference_vector = reference_vector[:, :min_dim]
+             student_vector = student_vector[:, :min_dim]
+         similarity = cosine_similarity(reference_vector, student_vector)[0][0]
+         similarity_results[key] = similarity
+
+     total_similarity = sum(similarity_results.values())
+     num_questions = len(similarity_results)
+     average_similarity = total_similarity / num_questions if num_questions else 0
+
+     return average_similarity
+
+ def llm_similarity(answers, student_result, llm_score_prompt_template):
+     score_prompt = llm_score_prompt_template
+     qa_chat_model = create_qamodel(model="gpt-4o-mini", temperature=0)
+
+     score_prompt_template = create_prompt(score_prompt)
+     student_score_chain = score_prompt_template | qa_chat_model
+
+     student_score = student_score_chain.invoke({"source": answers, "student": student_result})
+     llm_score_tokens = student_score.usage_metadata["total_tokens"]
+     student_score = dict(extract_json(student_score)[0])
+
+     total_score = sum(student_score.values())
+     num_questions = len(student_score)
+     average_score = total_score / num_questions if num_questions else 0
+
+     return average_score, llm_score_tokens
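
A minimal usage sketch of the cosine scorer (toy vectors, not part of this commit): questions missing from the student dict score 0, and the per-question scores are averaged.

    from calcscore import compute_cosine_similarity

    ref_emb = {"Question #1": [0.1, 0.2, 0.3], "Question #2": [0.4, 0.5, 0.6]}
    stu_emb = {"Question #1": [0.1, 0.2, 0.3]}  # "Question #2" missing -> scored 0
    print(compute_cosine_similarity(ref_emb, stu_emb))  # (1.0 + 0) / 2 = 0.5

The min_dim truncation branch exists because each question's chunk embeddings are flattened into one long row by reshape(1, -1), so answers that split into different numbers of chunks would otherwise produce incomparable vector lengths.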
extractjson.py ADDED
@@ -0,0 +1,14 @@
+ import json
+ import os
+ from langchain_core.messages import AIMessage
+ from typing import List, Dict, Tuple
+ import re
+
+ def extract_json(message: AIMessage) -> List[dict]:
+     text = message.content
+     pattern = r"```json(.*?)```"
+     matches = re.findall(pattern, text, re.DOTALL)
+     try:
+         return [json.loads(match.strip()) for match in matches]
+     except Exception:
+         raise ValueError(f"Failed to parse: {message}")
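
extract_json expects the model reply to wrap its JSON in a ```json fence; a quick sketch with a hand-built AIMessage:

    from langchain_core.messages import AIMessage
    from extractjson import extract_json

    msg = AIMessage(content='Here you go:\n```json\n{"Question #1": "42"}\n```')
    print(extract_json(msg))  # [{'Question #1': '42'}]

Note that a reply with no ```json fence returns an empty list rather than raising, since re.findall simply finds nothing to parse.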
final.py CHANGED
@@ -9,8 +9,9 @@
  import logging
  import sys
  import os
- import re
- import zipfile
+ import asyncio
+ import shutil
+ from readfile import prepare_files, USER_FILES_DIR
  from typing import List, Dict, Tuple
  from dotenv import load_dotenv
  from langchain_community.document_loaders import PyMuPDFLoader
@@ -27,7 +28,11 @@ import numpy as np
  from sklearn.metrics.pairwise import cosine_similarity
  import chainlit as cl
  import asyncio
- import zipfile
+ from readfile import prepare_files
+ from promptsplitembed import create_prompt, split_documents, generate_embeddings, create_qamodel
+ from extractjson import extract_json
+ from calcscore import compute_cosine_similarity, llm_similarity
+ from prompt_templates import ref_prompt, student_prompt, llm_score_prompt_template
 
  # Load environment variables
  load_dotenv()
@@ -37,133 +42,12 @@ openai.api_key = OPENAI_API_KEY
  # Set up logging
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)
-
- # Define constants
- REFERENCE_DOCUMENT_PATH = './Excel Review.pdf'
- UPLOAD_FOLDER = './uploads'
- TEMP_DIR = "./temp"
-
- # Ensure the upload folder exists
- os.makedirs(UPLOAD_FOLDER, exist_ok=True)
- os.makedirs(TEMP_DIR, exist_ok=True)
-
- def unzip_file(file_path: str, output_dir: str):
-     with zipfile.ZipFile(file_path, 'r') as zip_ref:
-         for member in zip_ref.namelist():
-             if not member.startswith('__MACOSX/'):
-                 zip_ref.extract(member, output_dir)
-
- def read_pdf(file_path: str) -> List[Document]:
-     loader = PyMuPDFLoader(file_path)
-     return loader.load()
-
- def read_docx(file_path: str) -> Document:
-     doc = DocxDocument(file_path)
-     text = "\n".join([p.text for p in doc.paragraphs])
-     return Document(page_content=text, metadata={"source": file_path})
-
- def read_files_from_directory(directory: str) -> List[Document]:
-     documents = []
-     for root, _, files in os.walk(directory):
-         for file in files:
-             file_path = os.path.join(root, file)
-             if os.path.basename(file_path).startswith('~$'):
-                 continue  # Skip temporary files
-             if file_path.endswith('.docx'):
-                 documents.append(read_docx(file_path))
-             elif file_path.endswith('.pdf'):
-                 documents.extend(read_pdf(file_path))
-     return documents
-
- def extract_json(message: AIMessage) -> List[dict]:
-     text = message.content
-     pattern = r"```json(.*?)```"
-     matches = re.findall(pattern, text, re.DOTALL)
-     try:
-         return [json.loads(match.strip()) for match in matches]
-     except Exception:
-         raise ValueError(f"Failed to parse: {message}")
 
- qa_chat_model = ChatOpenAI(
-     model="gpt-4o-mini",
-     temperature=0
+ qa_chat_model = create_qamodel(model="gpt-4o-mini", temperature=0
  )
-
- ref_prompt = f"""
- You are given a reference documents. The document contains a mix of instructions, guides, questions, and answers.
- Your task is to go through the reference document and extract questions and answers from the document step-by-step.
- Use the keyword 'Question #' to identify the start of each question.
- Retain the following words until the 'Answer:' as the question.
- Use the keyword 'Answer:' to identify the start of each answer.
- Retain the follwing words until the 'Question:' as the answer, until the end of the document.
- Remove any white spaces such as carriage returns.
- Return the question-answer pairs as a key-value pair as Dict type.
- ---
-
- Reference Document Content:
- {{source}}
-
- Please extract the question-answer pairs and return them as JSON.
- """
-
- ref_prompt_template = ChatPromptTemplate.from_template(ref_prompt)
- ref_generation_chain = ref_prompt_template | qa_chat_model
-
- student_prompt = f"""
- You are given a student assignment document. The document may contain a mix of instructions, guides, questions, and answers.
- Your task is to go through the student document and extract answers to questions from the document step-by-step.
- Use the reference document as a guide.
- Use the keyword 'Question #' to identify each question.
- Then for its associated values, search the student document for the answer.
- If you do not see any answer in the student document, return 'No answer found'.
- Do not make up any answer.
- Remove any white spaces such as carriage returns.
- Return the original question and the student answer pairs as a key-value pair as Dict type.
- ---
-
- Reference Content:
- {{source}}
-
- Student Content:
- {{student}}
-
- Please extract the question-answer pairs and return them as JSON.
- """
-
- student_prompt_template = ChatPromptTemplate.from_template(student_prompt)
- student_response_chain = student_prompt_template | qa_chat_model
-
- def split_documents(documents: List[Document]) -> List[Document]:
-     text_splitter = RecursiveCharacterTextSplitter(
-         chunk_size=500,
-         chunk_overlap=100,
-         length_function=len,
-         is_separator_regex=False
-     )
-     split_docs = text_splitter.split_documents(documents)
-     total_tokens = sum(len(doc.page_content) for doc in split_docs)  # Approximate token count
-     return split_docs, total_tokens
-
- def generate_embeddings(docs: List[Document]) -> List[List[float]]:
-     embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small")
-     embeddings = embeddings_model.embed_documents([doc.page_content for doc in docs])
-     total_tokens = sum(len(doc.page_content) for doc in docs)  # Approximate token count
-     return embeddings, total_tokens
-
- def prepare_files(zip_file_name: str):
-     unzip_file(os.path.join(UPLOAD_FOLDER, zip_file_name), TEMP_DIR)
-     documents = read_files_from_directory(os.path.join(TEMP_DIR, os.path.splitext(zip_file_name)[0]))
-     reference_document = read_pdf(REFERENCE_DOCUMENT_PATH)
-     return documents, reference_document
-
- def process_student(documents, reference):
-     test_doc = documents[0]
-     student_result = student_response_chain.invoke({"source": reference.keys(), "student": test_doc})
-     student_gen_tokens = student_result.usage_metadata["total_tokens"]
-     student_result = dict(extract_json(student_result)[0])
-     return student_result, student_gen_tokens
-
- def process_reference(reference_document):
+ def process_reference(reference_document, ref_prompt):
+     ref_prompt_template = create_prompt(ref_prompt)
+     ref_generation_chain = ref_prompt_template | qa_chat_model
      result = ref_generation_chain.invoke({"source": reference_document})
      ref_gen_tokens = result.usage_metadata["total_tokens"]
      reference = dict(extract_json(result)[0])
@@ -174,120 +58,54 @@ def process_reference(reference_document):
      question_number = key.split('#')[1]
      answer_key = f'Answer #{question_number}'
      answers[key] = reference[answer_key]
-
+     print("Processed reference document")
      return reference, answers, ref_gen_tokens
 
- def split_docs(answers, student_result):
+ def process_student(documents, reference, student_prompt):
+     test_doc = documents[0]
+     student_prompt_template = create_prompt(student_prompt)
+     student_response_chain = student_prompt_template | qa_chat_model
+     student_result = student_response_chain.invoke({"source": reference.keys(), "student": test_doc})
+     student_gen_tokens = student_result.usage_metadata["total_tokens"]
+     student_result = dict(extract_json(student_result)[0])
+     print("Processed student document")
+     return student_result, student_gen_tokens
+
+ def compare_docs(answers, student_result):
      split_reference_docs, ref_tokens = {}, 0
      split_student_docs, student_tokens = {}, 0
      for key, value in answers.items():
-         split_docs, tokens = split_documents([Document(page_content=value)])
-         split_reference_docs[key] = split_docs
+         chunks, tokens = split_documents([Document(page_content=value)])
+         split_reference_docs[key] = chunks
          ref_tokens += tokens
 
      for key, value in student_result.items():
-         split_docs, tokens = split_documents([Document(page_content=value)])
-         split_student_docs[key] = split_docs
+         chunks, tokens = split_documents([Document(page_content=value)])
+         split_student_docs[key] = chunks
          student_tokens += tokens
 
      reference_embeddings = {key: generate_embeddings(value)[0] for key, value in split_reference_docs.items()}
      student_embeddings = {key: generate_embeddings(value)[0] for key, value in split_student_docs.items()}
-
+     print("Completed comparing student and solution answers.")
      return reference_embeddings, student_embeddings, ref_tokens, student_tokens
 
- def compute_cosine_similarity(reference_embeddings: dict, student_embeddings: dict) -> float:
-     similarity_results = {}
-     for key in reference_embeddings.keys():
-         if key not in student_embeddings:
-             similarity_results[key] = 0
-             continue
-         reference_vector = np.array(reference_embeddings[key]).reshape(1, -1)
-         student_vector = np.array(student_embeddings[key]).reshape(1, -1)
-         if reference_vector.shape[1] != student_vector.shape[1]:
-             min_dim = min(reference_vector.shape[1], student_vector.shape[1])
-             reference_vector = reference_vector[:, :min_dim]
-             student_vector = student_vector[:, :min_dim]
-         similarity = cosine_similarity(reference_vector, student_vector)[0][0]
-         similarity_results[key] = similarity
-
-     total_similarity = sum(similarity_results.values())
-     num_questions = len(similarity_results)
-     average_similarity = total_similarity / num_questions if num_questions else 0
-
-     return average_similarity
-
-
- def llm_similarity(answers, student_result):
-     score_prompt = f"""
-     You are given two dictionaries representing instructor solution and student answers.
-     Your task is to go through each question to grade the correctness of student answer.
-     Use the keyword 'Question #' to identify each question.
-     Then for its associated values, compare student answer against the instructor answer.
-     If the instructor answer has numerical values, check to make sure the student answer has the same number,
-     whether it is expressed in numbers or text.
-     If you do not see any answer in the student answer, assign score 0 for that answer.
-     For student answer that is similar to instructor, assign a full score of 1.
-     If the student answer is similar enough, assign a partial score of 0.5.
-     Otherwise, assign a score of 0.
-     Return the original question and the student score pairs as a key-value pair as Dict type.
-     ---
-
-     Reference Content:
-     {{source}}
-
-     Student Content:
-     {{student}}
-
-     Please extract the question-answer pairs and return them as JSON.
-     """
-
-     score_prompt_template = ChatPromptTemplate.from_template(score_prompt)
-     student_score_chain = score_prompt_template | qa_chat_model
-
-     student_score = student_score_chain.invoke({"source": answers, "student": student_result})
-     llm_score_tokens = student_score.usage_metadata["total_tokens"]
-     student_score = dict(extract_json(student_score)[0])
-
-     total_score = sum(student_score.values())
-     num_questions = len(student_score)
-     average_score = total_score / num_questions if num_questions else 0
-
-     return average_score, llm_score_tokens
-
- def process_data(zip_file_name: str) -> Tuple[float, float, int, int, int]:
+ def process_data(zip_file_name: str, prompt_template) -> Tuple[float, float, int, int, int]:
      documents, reference_document = prepare_files(zip_file_name)
-     reference, answers, ref_gen_tokens = process_reference(reference_document)
-     student_result, student_gen_tokens = process_student(documents, reference)
-     reference_embeddings, student_embeddings, ref_tokens, student_tokens = split_docs(answers, student_result)
+     reference, answers, ref_gen_tokens = process_reference(reference_document, ref_prompt)
+     student_result, student_gen_tokens = process_student(documents, reference, student_prompt)
+     reference_embeddings, student_embeddings, ref_tokens, student_tokens = compare_docs(answers, student_result)
      student_total_tokens = student_gen_tokens + student_tokens
      ref_total_tokens = ref_gen_tokens + ref_tokens
 
      average_similarity = compute_cosine_similarity(reference_embeddings, student_embeddings)
-     average_score, llm_score_tokens = llm_similarity(answers, student_result)
+     average_score, llm_score_tokens = llm_similarity(answers, student_result, llm_score_prompt_template)
      llm_total_tokens = ref_gen_tokens + student_gen_tokens + llm_score_tokens
 
      return average_similarity, average_score, ref_total_tokens, student_total_tokens, llm_total_tokens
 
- async def process_grading():
-     global uploaded_file_name
-     if uploaded_file_name:
-         try:
-             # Process the uploaded ZIP file
-             average_similarity, average_score, ref_total_tokens, student_total_tokens, llm_total_tokens = process_data(uploaded_file_name)
-
-             # Send results
-             await cl.Message(content=f"Processing complete. Results:\n"
-                              f"Average Similarity: {average_similarity:.2f}\n"
-                              f"Average Score: {average_score:.2f}\n"
-                              f"Reference Total Tokens: {ref_total_tokens}\n"
-                              f"Student Total Tokens: {student_total_tokens}\n"
-                              f"LLM Total Tokens: {llm_total_tokens}").send()
-         except Exception as e:
-             await cl.Message(f"An error occurred while processing the zip file: {str(e)}").send()
-     else:
-         await cl.Message("No file has been uploaded yet. Please upload a ZIP file first.").send()
 
  user_wants_to_continue = False
+ uploaded_file_name = None
 
  @cl.on_chat_start
  async def start():
@@ -301,11 +119,11 @@ async def start():
      ).send()
 
      zip_file = files[0]  # Assuming only one file is uploaded
-     file_path = os.path.join(UPLOAD_FOLDER, zip_file.name)
+     file_path = os.path.join(USER_FILES_DIR, zip_file.name)
      uploaded_file_name = zip_file.name
 
-     # Move the uploaded file to the desired location
-     os.rename(zip_file.path, file_path)
+     # Move the uploaded file to the user files directory
+     shutil.move(zip_file.path, file_path)
 
      # Let the user know that the system is ready
      await cl.Message(content=f"`{zip_file.name}` uploaded successfully!").send()
@@ -313,6 +131,25 @@
      # Ask if the user wants to proceed with grading
      await cl.Message(content="Do you want to proceed with the grading? (yes/no)").send()
 
+ async def process_grading():
+     global uploaded_file_name
+     if uploaded_file_name:
+         try:
+             # Process the uploaded ZIP file
+             average_similarity, average_score, ref_total_tokens, student_total_tokens, llm_total_tokens = process_data(uploaded_file_name, llm_score_prompt_template)
+
+             # Send results
+             await cl.Message(content=f"Processing complete. Results:\n"
+                              f"Average Similarity: {average_similarity:.2f}\n"
+                              f"Average Score: {average_score:.2f}\n"
+                              f"Reference Total Tokens: {ref_total_tokens}\n"
+                              f"Student Total Tokens: {student_total_tokens}\n"
+                              f"LLM Total Tokens: {llm_total_tokens}").send()
+         except Exception as e:
+             await cl.Message(content=f"An error occurred while processing the zip file: {str(e)}").send()
+     else:
+         await cl.Message(content="No file has been uploaded yet. Please upload a ZIP file first.").send()
+
  @cl.on_message
  async def on_message(message: cl.Message):
      global user_wants_to_continue, uploaded_file_name
@@ -329,31 +166,11 @@ async def on_message(message: cl.Message):
          user_wants_to_continue = True
          await cl.Message(content="Do you want to continue? (yes/no)").send()
 
-     elif user_wants_to_continue:
-         if message.content.lower() == 'yes':
-             user_wants_to_continue = False
-             uploaded_file_name = None
-             await cl.Message(content="Restarting the app...").send()
-             await asyncio.sleep(1)
-             python = sys.executable
-             os.execl(python, python, *sys.argv)
-
-         elif message.content.lower() == 'no':
-             user_wants_to_continue = False
-             uploaded_file_name = None
-             await cl.Message(content="Okay, thank you for using the grading app. Restarting...").send()
-             await asyncio.sleep(1)
-             python = sys.executable
-             os.execl(python, python, *sys.argv)
-
-         else:
-             await cl.Message(content="Invalid response. Please type 'yes' or 'no'.").send()
-
-     elif message.content.lower() == 'no':
-         await cl.Message(content="Okay, thank you for using the grading app. Restarting...").send()
-         await asyncio.sleep(1)
-         python = sys.executable
-         os.execl(python, python, *sys.argv)
+     # ... rest of the function ...
 
-     else:
-         await cl.Message(content="Please type 'yes' to start processing or 'no' to exit.").send()
+ if __name__ == "__main__":
+     # Ensure the user files directory exists
+     os.makedirs(USER_FILES_DIR, exist_ok=True)
+
+     # Note: Chainlit apps are started from the CLI, e.g. `chainlit run final.py`,
+     # so no explicit run call is needed here.
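
With the helpers split out, final.py reduces to orchestration. A hedged end-to-end sketch (assuming student.zip is already in USER_FILES_DIR, ./Excel Review.pdf exists, and OPENAI_API_KEY is set):

    from final import process_data
    from prompt_templates import llm_score_prompt_template

    sim, score, ref_toks, stu_toks, llm_toks = process_data("student.zip", llm_score_prompt_template)
    print(f"cosine similarity: {sim:.2f}, LLM score: {score:.2f}, LLM tokens: {llm_toks}")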
prompt_templates.py ADDED
@@ -0,0 +1,59 @@
+ ref_prompt = """
+ You are given a reference document. The document contains a mix of instructions, guides, questions, and answers.
+ Your task is to go through the reference document and extract questions and answers from the document step-by-step.
+ Use the keyword 'Question #' to identify the start of each question.
+ Retain the following words until the 'Answer:' as the question.
+ Use the keyword 'Answer:' to identify the start of each answer.
+ Retain the following words until the next 'Question:' as the answer, until the end of the document.
+ Remove any whitespace such as carriage returns.
+ Return the question-answer pairs as key-value pairs of Dict type.
+ ---
+
+ Reference Document Content:
+ {source}
+
+ Please extract the question-answer pairs and return them as JSON.
+ """
+
+ student_prompt = """
+ You are given a student assignment document. The document may contain a mix of instructions, guides, questions, and answers.
+ Your task is to go through the student document and extract answers to questions from the document step-by-step.
+ Use the reference document as a guide.
+ Use the keyword 'Question #' to identify each question.
+ Then for its associated values, search the student document for the answer.
+ If you do not see any answer in the student document, return 'No answer found'.
+ Do not make up any answer.
+ Remove any whitespace such as carriage returns.
+ Return the original question and the student answer pairs as key-value pairs of Dict type.
+ ---
+
+ Reference Content:
+ {source}
+
+ Student Content:
+ {student}
+
+ Please extract the question-answer pairs and return them as JSON.
+ """
+ llm_score_prompt_template = """
+ You are given two dictionaries representing the instructor solution and the student answers.
+ Your task is to go through each question and grade the correctness of the student answer.
+ Use the keyword 'Question #' to identify each question.
+ Then for its associated values, compare the student answer against the instructor answer.
+ If the instructor answer has numerical values, check that the student answer has the same number,
+ whether it is expressed in digits or text.
+ If there is no answer in the student answer, assign a score of 0 for that answer.
+ If a student answer matches the instructor answer, assign a full score of 1.
+ If it is only partially correct, assign a partial score of 0.5.
+ Otherwise, assign a score of 0.
+ Return the original question and the student score pairs as key-value pairs of Dict type.
+ ---
+
+ Reference Content:
+ {source}
+
+ Student Content:
+ {student}
+
+ Please extract the question-score pairs and return them as JSON.
+ """
promptsplitembed.py ADDED
@@ -0,0 +1,33 @@
+ from typing import List, Dict, Tuple
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.schema import Document
+ from langchain_openai import OpenAIEmbeddings, ChatOpenAI
+
+ def create_prompt(prompt):
+     prompt_template = ChatPromptTemplate.from_template(prompt)
+     return prompt_template
+
+ def split_documents(documents: List[Document]) -> Tuple[List[Document], int]:
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=500,
+         chunk_overlap=100,
+         length_function=len,
+         is_separator_regex=False
+     )
+     split_docs = text_splitter.split_documents(documents)
+     total_tokens = sum(len(doc.page_content) for doc in split_docs)  # Approximate token count (characters)
+     return split_docs, total_tokens
+
+ def generate_embeddings(docs: List[Document]) -> Tuple[List[List[float]], int]:
+     embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small")
+     embeddings = embeddings_model.embed_documents([doc.page_content for doc in docs])
+     total_tokens = sum(len(doc.page_content) for doc in docs)  # Approximate token count (characters)
+     return embeddings, total_tokens
+
+ def create_qamodel(model="gpt-4o-mini", temperature=0):
+     qamodel = ChatOpenAI(
+         model=model,
+         temperature=temperature
+     )
+     return qamodel
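
A short sketch of the split-then-embed flow (toy document; the "token" totals are really character counts, as the comments note):

    from langchain.schema import Document
    from promptsplitembed import split_documents, generate_embeddings

    chunks, char_count = split_documents([Document(page_content="some long answer text " * 50)])
    vectors, _ = generate_embeddings(chunks)  # one 1536-dim vector per chunk; calls the OpenAI API
    print(len(chunks), char_count, len(vectors))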
readfile.py ADDED
@@ -0,0 +1,51 @@
+ import zipfile
+ from typing import List
+ from langchain_community.document_loaders import PyMuPDFLoader
+ from langchain.schema import Document
+ from docx import Document as DocxDocument
+ import os
+
+ # Define constants
+ REFERENCE_DOCUMENT_PATH = './Excel Review.pdf'
+ USER_FILES_DIR = os.getenv('CHAINLIT_USER_FILES_DIR', '/tmp/chainlit_user_files')
+
+ # Ensure the user files directory exists
+ os.makedirs(USER_FILES_DIR, exist_ok=True)
+
+ def unzip_file(file_path: str, output_dir: str):
+     with zipfile.ZipFile(file_path, 'r') as zip_ref:
+         for member in zip_ref.namelist():
+             if not member.startswith('__MACOSX/'):
+                 zip_ref.extract(member, output_dir)
+
+ def read_pdf(file_path: str) -> List[Document]:
+     loader = PyMuPDFLoader(file_path)
+     return loader.load()
+
+ def read_docx(file_path: str) -> Document:
+     doc = DocxDocument(file_path)
+     text = "\n".join([p.text for p in doc.paragraphs])
+     return Document(page_content=text, metadata={"source": file_path})
+
+ def read_files_from_directory(directory: str) -> List[Document]:
+     documents = []
+     for root, _, files in os.walk(directory):
+         for file in files:
+             file_path = os.path.join(root, file)
+             if os.path.basename(file_path).startswith('~$'):
+                 continue  # Skip temporary Office files
+             if file_path.endswith('.docx'):
+                 documents.append(read_docx(file_path))
+             elif file_path.endswith('.pdf'):
+                 documents.extend(read_pdf(file_path))
+     return documents
+
+ # Read files from the user upload
+ def prepare_files(zip_file_name: str):
+     zip_file_path = os.path.join(USER_FILES_DIR, zip_file_name)
+     unzip_dir = os.path.join(USER_FILES_DIR, os.path.splitext(zip_file_name)[0])
+     unzip_file(zip_file_path, unzip_dir)
+     documents = read_files_from_directory(unzip_dir)
+     reference_document = read_pdf(REFERENCE_DOCUMENT_PATH)
+     print("Your file", zip_file_name, "has been successfully unzipped")
+     return documents, reference_document
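
Usage sketch (assuming grades.zip was uploaded into USER_FILES_DIR and ./Excel Review.pdf sits next to the app):

    from readfile import prepare_files

    student_docs, reference_doc = prepare_files("grades.zip")
    print(len(student_docs), "student documents loaded")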