woyeso committed on
Commit
e476825
·
verified ·
1 Parent(s): e400569

create app.py

Browse files
Files changed (1) hide show
  1. app.py +415 -0
app.py ADDED
@@ -0,0 +1,415 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import json
import logging
import os
import re
import tempfile

import pdfplumber
import streamlit as st
import torch
from docx import Document
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
11
+
12
+ # Set up logging
13
+ logging.basicConfig(level=logging.DEBUG)
14
+ logger = logging.getLogger(__name__)
15
+
16
# Paths to rubric files (relative to the app's working directory).
P1_RUBRICS_PATH = os.path.join("data", "rubrics", "p1_rubrics.json")
P2_RUBRICS_PATH = os.path.join("data", "rubrics", "p2_rubrics.json")

# Load rubrics from JSON files
def load_rubrics(project_type):
    """Load the grading rubrics for the given project type.

    Args:
        project_type: "Group" (case-insensitive) selects the P1 (group)
            rubrics; any other value selects the P2 (individual) rubrics.

    Returns:
        The parsed rubric data (a list of rubric dicts).

    Raises:
        FileNotFoundError: If the rubric file does not exist.
        ValueError: If the rubric file contains invalid JSON.
    """
    rubric_file = P1_RUBRICS_PATH if project_type.lower() == "group" else P2_RUBRICS_PATH
    try:
        # Explicit encoding avoids platform-dependent default codecs.
        with open(rubric_file, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        raise FileNotFoundError(f"Rubric file not found: {rubric_file}")
    except json.JSONDecodeError as e:
        # Chain the original error so line/column info is preserved.
        raise ValueError(f"Error decoding JSON from {rubric_file}") from e
30
+
31
# Load model and tokenizer
@st.cache_resource
def load_model():
    """Build the grading model and its tokenizer.

    Loads the base Llama checkpoint, applies the fine-tuned LoRA adapter on
    top of it, and returns both pieces. Cached by Streamlit so the weights
    are only loaded once per process.

    Returns:
        (model, tokenizer) tuple ready for generation.
    """
    adapter_repo = "woyeso/fine_tuned_llama_3_2_assignment_grader"
    base_repo = "unsloth/Llama-3.2-3B-Instruct"  # Adjust if the base model differs
    # An unset/empty HF_TOKEN is normalized to None (anonymous access).
    auth_token = os.getenv("HF_TOKEN") or None

    tok = AutoTokenizer.from_pretrained(adapter_repo, token=auth_token)

    base = AutoModelForCausalLM.from_pretrained(
        base_repo,
        torch_dtype=torch.float16,
        device_map="auto",
        token=auth_token,
    )

    peft_model = PeftModel.from_pretrained(base, adapter_repo, token=auth_token)
    return peft_model, tok
52
+
53
# Load the model once at import time; the resource cache shares it across reruns.
model, tokenizer = load_model()
54
+
55
# Subcomponent mappings (same as original)
# Map rubric subcomponent codes ("<section>.<item>") to the heading text used
# when reporting extracted sections. P1 = group project.
P1_SUBCOMPONENTS = {
    '1.1': 'Information of the Service Recipients Found:',
    '1.2': 'Information Related to the Use of AI in Teaching and Learning:',
    '1.3': 'Service Project Title and Topics:',
    '1.4': 'Specific Project Objectives:',
    '2.1': 'Design of AI-Related Ice-breaking Games:',
    '2.2': 'Tasks of Each Team Member:',
    '3.1': 'Specific STEM Elements Covered:',
    '3.2': 'Student Abilities to Strengthen:',
    '3.3': 'Potential Learning Hurdles of Students:',
    '3.4': 'Facilitating STEM and Overcoming Hurdles:',
    '4.1': 'List of Materials and Parts:',
    '4.2': 'List of Tools:'
}

# P2 = individual project.
P2_SUBCOMPONENTS = {
    '1.1': 'Specific Learning Objectives:',
    '1.2': 'Content of Each Teaching Kit:',
    '2.1': 'Describe the Design of Each Teaching Kit:',
    '2.2': 'How to Prepare (or Make) Each Item of Your Teaching Kit:',
    '2.3': 'Explain Why Students Will Learn and Play Happily:',
    '3.1': 'Draw a Diagram to Illustrate Task Breakdown:',
    '4.1': 'How to Introduce the Specific Topic(s) to Arouse Interest in STEM:',
    '4.2': 'How to Identify and Overcome Learning Hurdles:',
    '5.1': 'How to React to Potential Uncertainties:',
    '5.2': 'How to Self-Evaluate Performance and Make Improvements:'
}
83
+
84
# Text extraction functions (unchanged)
def extract_text_between_strings(text, start_keyword, end_keyword):
    """Extract and normalize the text between two regex markers.

    Finds ``start_keyword`` (multiline regex) in ``text`` and returns the
    cleaned content up to the first occurrence of ``end_keyword`` *after*
    the start marker, or to the end of ``text`` if no end marker follows.
    Bullet lines are re-joined and all whitespace is collapsed to single
    spaces, so the result is one line. The training placeholder
    "XYZ students" is replaced with "Hong Chi students".

    Returns:
        The cleaned section text, "Not Found" when the start marker is
        missing or the section is empty, or an "Error: ..." string on an
        unexpected failure.
    """
    log = logging.getLogger(__name__)
    try:
        start_match = re.search(start_keyword, text, re.MULTILINE)
        if not start_match:
            log.debug(f"Start keyword '{start_keyword}' not found.")
            return "Not Found"

        start_index = start_match.end()
        # Fix: search only the remainder. Searching the whole text could
        # find an end marker that occurs *before* the start marker, which
        # made the function silently return everything to the end of the
        # document instead of just this section.
        remainder = text[start_index:]
        end_match = re.search(end_keyword, remainder, re.MULTILINE)
        if end_match:
            extracted_text = remainder[:end_match.start()].strip()
        else:
            extracted_text = remainder.strip()

        if not extracted_text:
            log.debug(f"End keyword '{end_keyword}' not found or no content extracted.")
            return "Not Found"

        bullet_pattern = re.compile(r'^\s*(\d+\.|\•|-|◦|➢)\s*(.+)$')
        formatted_lines = []
        for line in extracted_text.split('\n'):
            line = line.strip()
            if not line:
                continue
            bullet_match = bullet_pattern.match(line)
            if bullet_match:
                # `body` instead of `text` — the original shadowed the parameter.
                bullet, body = bullet_match.groups()
                formatted_lines.append(f"{bullet} {body}")
            else:
                formatted_lines.append(line)
        cleaned_text = "\n".join(formatted_lines).strip()
        # NOTE: \s+ also matches '\n', so this collapses the result onto one line.
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text.replace('\n', '\n '))
        return cleaned_text.replace("XYZ students", "Hong Chi students")

    except Exception as e:
        log.error(f"Error extracting text: {e}")
        return f"Error: {e}"
125
+
126
def extract_text_from_pdf(filepath, assignment_type='P1'):
    """Extract every rubric subcomponent section from a PDF submission.

    Flattens the whole document into one text string, then slices out each
    subcomponent using its numeric code as the start marker and the next
    code (or the final top-level section number) as the end marker.

    Args:
        filepath: Path of the PDF file to read.
        assignment_type: 'P1' (group) or 'P2' (individual); selects the
            subcomponent mapping and the final end marker.

    Returns:
        Dict mapping each code to {"title": ..., "content": ...}; content
        is "Not Found" when a section could not be located.
    """
    subcomponents = P1_SUBCOMPONENTS if assignment_type == 'P1' else P2_SUBCOMPONENTS
    sorted_codes = sorted(subcomponents.keys(), key=lambda c: [int(part) for part in c.split('.')])

    # Concatenate all pages; pages with no extractable text add an empty line.
    with pdfplumber.open(filepath) as pdf:
        text = "".join((page.extract_text() or "") + "\n" for page in pdf.pages)

    # After the last subcomponent, the section is terminated by the next
    # top-level heading ("5." for P1, "6." for P2).
    final_marker = r"^5\.\s*" if assignment_type == 'P1' else r"^6\.\s*"

    results = {}
    for idx, code in enumerate(sorted_codes):
        start_keyword = r"^{}\s*[.:]?\s*".format(re.escape(code))
        remaining = sorted_codes[idx + 1:]
        if remaining:
            end_keyword = r"^{}\s*[.:]?\s*".format(re.escape(remaining[0]))
        else:
            end_keyword = final_marker

        logger.debug(f"Extracting section {code} with start_keyword={start_keyword}, end_keyword={end_keyword}")
        results[code] = {
            "title": subcomponents[code],
            "content": extract_text_between_strings(text, start_keyword, end_keyword),
        }

    return results
152
+
153
def _clean_section_content(section_content, bullet_pattern):
    """Normalize a section's accumulated lines into a single cleaned string:
    re-join bullet lines, collapse whitespace onto one line, and substitute
    the "XYZ students" training placeholder."""
    formatted_lines = []
    for content_line in section_content:
        bullet_match = bullet_pattern.match(content_line)
        if bullet_match:
            bullet, body = bullet_match.groups()
            formatted_lines.append(f"{bullet} {body}")
        else:
            formatted_lines.append(content_line)
    cleaned = "\n".join(formatted_lines).strip()
    # NOTE: \s+ also matches '\n', so the result collapses to a single line.
    cleaned = re.sub(r'\s+', ' ', cleaned.replace('\n', '\n '))
    return cleaned.replace("XYZ students", "Hong Chi students")


def extract_text_from_docx(filepath, assignment_type='P1'):
    """Extract every rubric subcomponent section from a DOCX submission.

    Walks the document's paragraphs and tables in order, splitting the text
    into sections at lines that start with a subcomponent code (e.g. "2.1")
    and closing the final section at the next top-level heading ("5." for
    P1, "6." for P2). Missing subcomponents are reported as "Not Found".

    Args:
        filepath: Path of the .docx file to read.
        assignment_type: 'P1' (group) or 'P2' (individual).

    Returns:
        Dict mapping each code to {"title": ..., "content": ...}; an empty
        dict if extraction fails entirely.
    """
    try:
        doc = Document(filepath)
        elements = []
        for para in doc.paragraphs:
            para_text = para.text.strip()
            if para_text:
                elements.append(('paragraph', para_text, para.style.name))
        for table in doc.tables:
            table_text = []
            for row in table.rows:
                row_text = [cell.text.strip() for cell in row.cells if cell.text.strip()]
                if row_text:
                    table_text.append(" ".join(row_text))
            if table_text:
                elements.append(('table', "\n".join(table_text), 'Table'))

        logger.debug(f"Extracted {len(elements)} elements from DOCX")

        results = {}
        subcomponents = P1_SUBCOMPONENTS if assignment_type == 'P1' else P2_SUBCOMPONENTS
        sorted_codes = sorted(subcomponents.keys(), key=lambda x: [int(n) for n in x.split('.')])

        current_section = None
        section_content = []
        section_pattern = re.compile(r'^\s*(\d+\.\d+\.?)\s*[.:]?\s*(.*)?$')
        end_pattern = re.compile(r'^\s*5\.\s*' if assignment_type == 'P1' else r'^\s*6\.\s*')
        bullet_pattern = re.compile(r'^\s*(\d+\.|\•|-|◦|➢)\s*(.+)$')

        def _close_section():
            # Store the accumulated content for the section being closed.
            cleaned = _clean_section_content(section_content, bullet_pattern)
            results[current_section] = {
                "title": subcomponents[current_section],
                "content": cleaned if cleaned else "Not Found",
            }

        for i, (elem_type, text, style) in enumerate(elements):
            logger.debug(f"Processing element {i}: type={elem_type}, style={style}, text={text[:100]}...")

            for line in text.split('\n'):
                line = line.strip()
                if not line:
                    continue

                section_match = section_pattern.match(line)
                if section_match:
                    code, title = section_match.groups()
                    code = code.rstrip('.')
                    if current_section and current_section in subcomponents:
                        _close_section()
                    current_section = code
                    section_content = []
                    if title:
                        section_content.append(title)
                    logger.debug(f"Started section {code} at element {i}")
                    continue

                if end_pattern.match(line) and current_section:
                    # Fix: guard the lookup like the other branches do. The
                    # original indexed subcomponents[current_section] here
                    # unconditionally, so an unrecognized code raised
                    # KeyError and the broad handler threw away ALL results.
                    if current_section in subcomponents:
                        _close_section()
                    current_section = None
                    section_content = []
                    logger.debug(f"Ended section at element {i} with end marker")
                    continue

                if current_section:
                    bullet_match = bullet_pattern.match(line)
                    if style.startswith('List') or bullet_match:
                        if bullet_match:
                            bullet, body = bullet_match.groups()
                            section_content.append(f"{bullet} {body}")
                        else:
                            # List-styled paragraph without an explicit marker.
                            section_content.append(f"- {line}")
                    else:
                        section_content.append(line)

        # Flush the last open section, if any.
        if current_section and current_section in subcomponents:
            _close_section()

        # Guarantee every expected subcomponent appears in the result.
        for code in sorted_codes:
            if code not in results:
                results[code] = {
                    "title": subcomponents[code],
                    "content": "Not Found",
                }
                logger.debug(f"Subcomponent {code} not found in DOCX")

        return results

    except Exception as e:
        logger.error(f"Error extracting text from DOCX: {e}")
        return {}
282
+
283
# Function to evaluate submission using the model
def evaluate_submission(subcomponent, project_type, rubric, submission, school_name):
    """Ask the fine-tuned model to grade one subcomponent of a submission.

    Builds an instruction prompt containing the rubric criteria and the
    student's text, generates feedback with sampling, and returns the
    decoded output. Callers parse an "Overall Mark: X/10" line out of the
    returned text.

    Args:
        subcomponent: Rubric code such as "1.1".
        project_type: "Group" or "Individual" (used only inside the prompt).
        rubric: Rubric criteria for this subcomponent (interpolated as text).
        submission: The student's extracted section text.
        school_name: School name for personalization, or "Not provided".

    Returns:
        The full decoded model output as a string.
    """
    prompt = (
        f"Can you evaluate my project submission for Subcomponent {subcomponent} in a {project_type} project (P1 for group, P2 for individual).\n"
        f"Here is the rubric: {rubric}. Evaluate the submission against each rubric criterion. Focus on the rubric criteria as the primary basis for your evaluation.\n"
        f"My submission is {submission}.\n\n"
        f"If a school name is provided, use it in your evaluation: {school_name}. If no school name is provided, refer to the students generically as 'students'.\n"
        f"Do not use the placeholder 'XYZ students' in your evaluation, as it was used during training but should be replaced with the specific school name or 'students'.\n\n"
        f"Summarize the strengths of the submission (what it does well according to the rubric).\n"
        f"Summarize the weaknesses of the submission (where it falls short according to the rubric).\n"
        f"Provide specific suggestions for improvement to help the student improve their submission.\n\n"
        f"Give me an overall mark out of 10, and don't be too strict. Ensure you provide the score in the format: <Overall Mark: X/10>. Do not omit the score and follow format of X/10."
    )
    # NOTE(review): max_length=512 truncates long rubric+submission prompts;
    # confirm this limit is adequate for typical section lengths.
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=256,
            temperature=0.7,  # temperature/top_p take effect because do_sample=True
            top_p=0.9,
            do_sample=True
        )
    # NOTE(review): for causal LMs, outputs[0] typically echoes the prompt
    # tokens before the generated continuation — callers should tolerate that.
    feedback = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return feedback
310
+
311
# Streamlit app
st.title("Assignment Grader App")

# File upload
uploaded_file = st.file_uploader("Upload PDF/DOCX", type=["pdf", "docx"])
project_type = st.selectbox("Project Type", ["Group (P1)", "Individual (P2)"])
school_name = st.text_input("School Name (Optional)")
group_number = st.text_input("Group Number (Optional)")

# Manual text input as fallback
manual_text = st.text_area("Or enter your submission text manually (optional)")

if st.button("Evaluate"):
    if uploaded_file or manual_text:
        # Load rubrics
        project_type_short = "Group" if project_type == "Group (P1)" else "Individual"
        project = "P1" if project_type == "Group (P1)" else "P2"
        try:
            rubrics = load_rubrics(project_type_short)
        except Exception as e:
            st.error(f"Error loading rubrics: {str(e)}")
            st.stop()

        # Extract text from file or use manual input
        submission_dict = {}
        if uploaded_file:
            # Fix: write the upload to a unique temporary file. The original
            # fixed "/tmp/uploaded_file" path collides when two users submit
            # concurrently and does not exist on Windows.
            suffix = os.path.splitext(uploaded_file.name)[1].lower()
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                tmp.write(uploaded_file.read())
                tmp_path = tmp.name
            try:
                # Case-insensitive extension check (".PDF" uploads work too).
                if suffix == ".pdf":
                    results = extract_text_from_pdf(tmp_path, project)
                else:
                    results = extract_text_from_docx(tmp_path, project)
            finally:
                # Always remove the temp file, even if extraction raises.
                os.remove(tmp_path)

            for subcomponent, data in results.items():
                if data["content"] != "Not Found":
                    submission_dict[subcomponent] = data["content"]
        else:
            submission_dict["1.1"] = manual_text  # Simplified for manual input; adjust as needed

        if not submission_dict:
            st.error("No text extracted from the file or provided manually.")
            st.stop()

        # Evaluate submissions
        evaluations = []
        total_score = 0
        total_weight = 0

        with st.spinner("Evaluating submission..."):
            for rubric in rubrics:
                subcomponent = rubric["subcomponent"]
                if subcomponent not in submission_dict:
                    continue

                submission = submission_dict[subcomponent]
                evaluation = evaluate_submission(
                    subcomponent,
                    project_type_short,
                    rubric["criteria"],
                    submission,
                    school_name if school_name else "Not provided"
                )

                # Swap the training placeholder for the real school name.
                if school_name:
                    evaluation = evaluation.replace("XYZ students", f"{school_name} students")
                else:
                    evaluation = evaluation.replace("XYZ students", "students")

                # Parse "Overall Mark: X/10"; the "/10" suffix is optional.
                score_match = re.search(r"Overall Mark:\s*([\d.]+)(?:\s*/\s*10)?", evaluation, re.IGNORECASE)
                score = float(score_match.group(1)) if score_match else 0

                weight = rubric.get("weight", 1.0)
                total_score += score * weight
                total_weight += weight

                evaluations.append({
                    "subcomponent": subcomponent,
                    "evaluation": evaluation,
                    "score": score,
                    "weight": weight
                })

        # Calculate final grade as a percentage (per-section scores are /10).
        final_grade = (total_score / total_weight) * 10 if total_weight > 0 else 0
        final_grade = round(final_grade, 2)

        # Display results
        group_display = f" {group_number}" if group_number else ""
        summary = f"**Summary of Evaluations for {project} Project (Group{group_display})**\n\n"
        separator = "********************************************************************\n"
        # `item`, not `eval`: the original shadowed the builtin eval().
        for idx, item in enumerate(evaluations):
            summary += f"**Subcomponent {item['subcomponent']} (Weight: {item['weight']*100}%)**\n"
            summary += item["evaluation"]
            summary += "\n\n"
            if idx < len(evaluations) - 1:
                summary += separator

        summary += f"**Final Total Grade: {final_grade}%**"

        st.subheader("Evaluation Results")
        st.markdown(summary)
    else:
        st.error("Please upload a file or enter text manually.")