# qp-parser/app/parser.py — QP Parser (MakPr016, commit d81169f)
import re
from typing import Any
# Boilerplate header/column labels (compared lower-cased) that carry no
# question data and are dropped during tokenisation.
SKIP_LINES = {
    "q. no.", "questions", "marks", "cl", "co", "co no.", "co description",
    "cognitive", "level", "remember", "understand", "apply", "analyze",
    "analyse", "evaluate", "create", "course outcomes", "usn", "no.",
}
# Maps every accepted Bloom's-level spelling (CL/BTL/L prefix, lower-cased)
# to the canonical CL1..CL6 form used in the output.
BLOOMS_MAP = {
    "cl1": "CL1", "cl2": "CL2", "cl3": "CL3",
    "cl4": "CL4", "cl5": "CL5", "cl6": "CL6",
    "btl1": "CL1", "btl2": "CL2", "btl3": "CL3",
    "btl4": "CL4", "btl5": "CL5", "btl6": "CL6",
    "l1": "CL1", "l2": "CL2", "l3": "CL3",
    "l4": "CL4", "l5": "CL5", "l6": "CL6",
}
# Whole-line classifiers for a single extracted layout cell.
IS_PART_HEADER = re.compile(r'^\s*PART\s+[A-Z]\s*$', re.IGNORECASE)  # e.g. "PART A"
IS_OR = re.compile(r'^\s*OR\s*$', re.IGNORECASE)  # choice separator between paired questions
IS_PAGE_BREAK = re.compile(r'---\s*PAGE BREAK\s*---', re.IGNORECASE)
IS_QUESTION_NUM = re.compile(r'^\s*(\d{1,2})\s*$')  # bare number: question number OR marks
IS_SUB_LABEL = re.compile(r'^\s*([a-e])\s*$', re.IGNORECASE)  # sub-question letter
IS_BLOOMS = re.compile(r'^\s*(CL[1-6]|BTL[1-6]|L[1-6])\s*$', re.IGNORECASE)
IS_CO = re.compile(r'^\s*(CO\s*\d{1,2})\s*$', re.IGNORECASE)  # e.g. "CO 3"
def normalise_co(raw: str) -> str:
    """Normalise a CO cell such as "CO 3" or "co3" to the canonical "CO3".

    Falls back to an upper-cased, space-stripped copy of the input when no
    digit is present at all.
    """
    match = re.search(r'\d+', raw)
    if match is not None:
        return f"CO{match.group()}"
    return raw.strip().upper().replace(" ", "")
def clean(line: str) -> str:
    """Return *line* with leading and trailing whitespace removed."""
    return line.strip()
def is_skip(line: str) -> bool:
    """Return True for blank lines and boilerplate header/column labels.

    Fix: the original called ``clean(line)`` twice per invocation; the
    stripped value is now computed once.
    """
    stripped = clean(line)
    return not stripped or stripped.lower() in SKIP_LINES
# ── Token classifier ─────────────────────────────────────────────────────────
# We first convert the raw lines into a flat token list, then run the state
# machine over tokens. This avoids ambiguity between a standalone "4" being
# a marks value vs a question number.
TOKEN_PART = "PART"      # "PART A" style section header
TOKEN_OR = "OR"          # choice separator between paired questions
TOKEN_QNUM = "QNUM"      # question number (context-dependent)
TOKEN_SUB = "SUB"        # sub-question label: a / b / c / d / e
TOKEN_BLOOMS = "BLOOMS"  # canonical Bloom's level, e.g. "CL3"
TOKEN_CO = "CO"          # course outcome, e.g. "CO2"
TOKEN_MARKS = "MARKS"    # marks value, 1-25
TOKEN_TEXT = "TEXT"      # free question-body text
def tokenise(lines: list[str]) -> list[tuple[str, str]]:
    """Convert raw extracted lines into a flat list of (kind, value) tokens.

    Blank lines, page-break markers and boilerplate labels (``SKIP_LINES``)
    are dropped.  A standalone 1-2 digit number is ambiguous — it may be a
    marks value or a question number — so it is emitted as an "AMBIG" token
    and resolved by the state machine in ``parse_question_paper``.

    Fix: ``line`` is already stripped by ``clean``; the redundant extra
    ``.strip()`` calls on the BLOOMS / SUB / AMBIG branches were removed.
    """
    tokens: list[tuple[str, str]] = []
    for raw in lines:
        line = clean(raw)
        if not line:
            continue
        if IS_PAGE_BREAK.match(line):
            continue
        if is_skip(line):
            continue
        if IS_PART_HEADER.match(line):
            tokens.append((TOKEN_PART, line))
        elif IS_OR.match(line):
            tokens.append((TOKEN_OR, line))
        elif IS_BLOOMS.match(line):
            key = line.upper().replace(" ", "")
            tokens.append((TOKEN_BLOOMS, BLOOMS_MAP.get(key.lower(), key)))
        elif IS_CO.match(line):
            tokens.append((TOKEN_CO, normalise_co(line)))
        elif IS_SUB_LABEL.match(line):
            tokens.append((TOKEN_SUB, line.lower()))
        elif IS_QUESTION_NUM.match(line):
            # Ambiguous — could be marks or a question number; the state
            # machine resolves AMBIG tokens using surrounding context.
            tokens.append(("AMBIG", line))
        else:
            tokens.append((TOKEN_TEXT, line))
    return tokens
def parse_question_paper(text: str) -> dict[str, Any]:
    """Parse extracted question-paper text into structured question parts.

    The text is tokenised (see ``tokenise``) and the token stream is run
    through a small state machine.  A "part" is one question pair
    (Q1 OR Q2); each sub-question records label, marks, Bloom's level and
    CO number.

    Fixes over the previous revision:
      * A sub-question that already has marks but is interrupted by the next
        SUB label / OR / PART header is now committed with blank blooms/CO
        instead of being silently discarded — this also makes the
        "please fill manually" warnings reachable mid-stream (previously the
        commit-on-SUB condition was dead code, since ``co_number`` is cleared
        the instant ``commit_sub`` runs).
      * ``after_or`` is cleared once the second question number of an OR pair
        is consumed, so in a paper shaped "3 OR 4, 5 OR 6" the standalone
        "5" starts a new part instead of overwriting the previous pair's Q2.

    Args:
        text: Raw paper text, one extracted layout cell per line.

    Returns:
        Dict with keys ``question_parts``, ``total_marks`` and ``warnings``.
    """
    lines = text.splitlines()
    tokens = tokenise(lines)
    warnings: list[str] = []
    parts: list[dict] = []
    part_number = 0
    current_q1: int | None = None
    current_q2: int | None = None
    after_or = False
    # Per-sub-question state.
    sub_label = ""
    marks = 0
    blooms = ""
    co_number = ""
    in_sub = False
    # Sequence tracker: after SUB we expect TEXT* MARKS BLOOMS CO.
    saw_marks = False
    saw_blooms = False

    def get_or_create_part() -> dict:
        # Parts are keyed by part_number and created lazily on first commit.
        for p in parts:
            if p["partNumber"] == part_number:
                return p
        p = {
            "partNumber": part_number,
            "question1Number": current_q1 or 1,
            "question2Number": current_q2 or (current_q1 + 1 if current_q1 else 2),
            "subQuestions": [],
        }
        parts.append(p)
        return p

    def commit_sub():
        # Flush the current sub-question into its part, then clear sub state.
        nonlocal in_sub, sub_label, marks, blooms, co_number, saw_marks, saw_blooms
        p = get_or_create_part()
        if after_or and current_q2:
            p["question2Number"] = current_q2
        p["subQuestions"].append({
            "label": sub_label,
            "marks": marks,
            "bloomsLevel": blooms,
            "co_number": co_number,
        })
        in_sub = False
        sub_label = ""
        marks = 0
        blooms = ""
        co_number = ""
        saw_marks = False
        saw_blooms = False

    def reset_sub():
        # Discard the current (marks-less) sub-question state.
        nonlocal in_sub, sub_label, marks, blooms, co_number, saw_marks, saw_blooms
        in_sub = False
        sub_label = ""
        marks = 0
        blooms = ""
        co_number = ""
        saw_marks = False
        saw_blooms = False

    def finish_sub():
        # Boundary handling: keep a partially-parsed sub that already has
        # marks (blooms/CO stay blank and trigger the warnings below);
        # a sub with no marks carries no usable data and is dropped.
        if saw_marks:
            commit_sub()
        else:
            reset_sub()

    for kind, value in tokens:
        # ── PART / OR bookkeeping ────────────────────────────────────────────
        if kind == TOKEN_PART:
            if in_sub:
                finish_sub()
            after_or = False
            continue
        if kind == TOKEN_OR:
            if in_sub:
                finish_sub()
            after_or = True
            continue
        # AMBIG: marks when we are mid-sub and still waiting for marks;
        # question number when outside a sub; otherwise noise.
        if kind == "AMBIG":
            val = int(value)
            if in_sub and not saw_marks and 1 <= val <= 25:
                marks = val
                saw_marks = True
            elif not in_sub:
                if after_or:
                    # Second number of an OR pair.
                    current_q2 = val
                    for p in parts:
                        if p["partNumber"] == part_number:
                            p["question2Number"] = val
                    # The OR pair is now complete: the next standalone
                    # number starts a fresh question pair.
                    after_or = False
                else:
                    current_q1 = val
                    current_q2 = val + 1
                    part_number += 1
            # Anything else (e.g. numbers inside question text) is noise.
            continue
        # ── Sub-question label ───────────────────────────────────────────────
        if kind == TOKEN_SUB:
            if in_sub:
                finish_sub()
            sub_label = value
            in_sub = True
            continue
        # ── Within a sub-question ────────────────────────────────────────────
        if in_sub:
            if kind == TOKEN_BLOOMS and saw_marks:
                blooms = value
                saw_blooms = True
            elif kind == TOKEN_CO and saw_blooms:
                co_number = value
                commit_sub()
            # TEXT inside a sub is the question body — not extracted.
            continue
        # Anything else outside a sub — ignore.
    # Flush a trailing sub that has marks but never saw blooms/CO.
    if in_sub and saw_marks:
        commit_sub()
    # ── Warnings ─────────────────────────────────────────────────────────────
    if not parts:
        warnings.append("No sub-questions detected. The paper layout may be unusual.")
    else:
        missing_co = sum(1 for p in parts for sq in p["subQuestions"] if not sq["co_number"])
        missing_bl = sum(1 for p in parts for sq in p["subQuestions"] if not sq["bloomsLevel"])
        if missing_co:
            warnings.append(f"{missing_co} sub-question(s) have no CO detected — please fill manually.")
        if missing_bl:
            warnings.append(f"{missing_bl} sub-question(s) have no Bloom's level detected — please fill manually.")
    total_marks = sum(sq["marks"] for p in parts for sq in p["subQuestions"])
    return {
        "question_parts": parts,
        "total_marks": total_marks,
        "warnings": warnings,
    }