# qp-parser/app/parser.py — QP Parser (MakPr016, commit d81169f)
import re
from typing import Any
# Boilerplate header/column labels (compared lower-cased) that carry no
# question data and are dropped during tokenisation.
SKIP_LINES = {
    "q. no.", "questions", "marks", "cl", "co", "co no.", "co description",
    "cognitive", "level", "remember", "understand", "apply", "analyze",
    "analyse", "evaluate", "create", "course outcomes", "usn", "no.",
}
# Maps every accepted Bloom's-level spelling (CL/BTL/L prefix, lower-cased)
# to the canonical CL1..CL6 form used in the output.
BLOOMS_MAP = {
    "cl1": "CL1", "cl2": "CL2", "cl3": "CL3",
    "cl4": "CL4", "cl5": "CL5", "cl6": "CL6",
    "btl1": "CL1", "btl2": "CL2", "btl3": "CL3",
    "btl4": "CL4", "btl5": "CL5", "btl6": "CL6",
    "l1": "CL1", "l2": "CL2", "l3": "CL3",
    "l4": "CL4", "l5": "CL5", "l6": "CL6",
}
# Whole-line classifiers for a single extracted layout cell.
IS_PART_HEADER = re.compile(r'^\s*PART\s+[A-Z]\s*$', re.IGNORECASE)  # e.g. "PART A"
IS_OR = re.compile(r'^\s*OR\s*$', re.IGNORECASE)  # choice separator between paired questions
IS_PAGE_BREAK = re.compile(r'---\s*PAGE BREAK\s*---', re.IGNORECASE)
IS_QUESTION_NUM = re.compile(r'^\s*(\d{1,2})\s*$')  # bare number: question number OR marks
IS_SUB_LABEL = re.compile(r'^\s*([a-e])\s*$', re.IGNORECASE)  # sub-question letter
IS_BLOOMS = re.compile(r'^\s*(CL[1-6]|BTL[1-6]|L[1-6])\s*$', re.IGNORECASE)
IS_CO = re.compile(r'^\s*(CO\s*\d{1,2})\s*$', re.IGNORECASE)  # e.g. "CO 3"
def normalise_co(raw: str) -> str:
    """Normalise a CO cell such as "CO 3" or "co3" to the canonical "CO3".

    Falls back to an upper-cased, space-stripped copy of the input when no
    digit is present at all.
    """
    match = re.search(r'\d+', raw)
    if match is not None:
        return f"CO{match.group()}"
    return raw.strip().upper().replace(" ", "")
def clean(line: str) -> str:
    """Return *line* with leading and trailing whitespace removed."""
    return line.strip()
def is_skip(line: str) -> bool:
    """Return True for blank lines and boilerplate header/column labels.

    Fix: the original called ``clean(line)`` twice per invocation; the
    stripped value is now computed once.
    """
    stripped = clean(line)
    return not stripped or stripped.lower() in SKIP_LINES
# ── Token classifier ─────────────────────────────────────────────────────────
# We first convert the raw lines into a flat token list, then run the state
# machine over tokens. This avoids ambiguity between a standalone "4" being
# a marks value vs a question number.
TOKEN_PART = "PART"      # "PART A" style section header
TOKEN_OR = "OR"          # choice separator between paired questions
TOKEN_QNUM = "QNUM"      # question number (context-dependent)
TOKEN_SUB = "SUB"        # sub-question label: a / b / c / d / e
TOKEN_BLOOMS = "BLOOMS"  # canonical Bloom's level, e.g. "CL3"
TOKEN_CO = "CO"          # course outcome, e.g. "CO2"
TOKEN_MARKS = "MARKS"    # marks value, 1-25
TOKEN_TEXT = "TEXT"      # free question-body text
def tokenise(lines: list[str]) -> list[tuple[str, str]]:
    """Convert raw extracted lines into a flat list of (kind, value) tokens.

    Blank lines, page-break markers and boilerplate labels (``SKIP_LINES``)
    are dropped.  A standalone 1-2 digit number is ambiguous — it may be a
    marks value or a question number — so it is emitted as an "AMBIG" token
    and resolved by the state machine in ``parse_question_paper``.

    Fix: ``line`` is already stripped by ``clean``; the redundant extra
    ``.strip()`` calls on the BLOOMS / SUB / AMBIG branches were removed.
    """
    tokens: list[tuple[str, str]] = []
    for raw in lines:
        line = clean(raw)
        if not line:
            continue
        if IS_PAGE_BREAK.match(line):
            continue
        if is_skip(line):
            continue
        if IS_PART_HEADER.match(line):
            tokens.append((TOKEN_PART, line))
        elif IS_OR.match(line):
            tokens.append((TOKEN_OR, line))
        elif IS_BLOOMS.match(line):
            key = line.upper().replace(" ", "")
            tokens.append((TOKEN_BLOOMS, BLOOMS_MAP.get(key.lower(), key)))
        elif IS_CO.match(line):
            tokens.append((TOKEN_CO, normalise_co(line)))
        elif IS_SUB_LABEL.match(line):
            tokens.append((TOKEN_SUB, line.lower()))
        elif IS_QUESTION_NUM.match(line):
            # Ambiguous — could be marks or a question number; the state
            # machine resolves AMBIG tokens using surrounding context.
            tokens.append(("AMBIG", line))
        else:
            tokens.append((TOKEN_TEXT, line))
    return tokens
def parse_question_paper(text: str) -> dict[str, Any]:
    """Parse extracted question-paper text into structured question parts.

    The text is tokenised (see ``tokenise``) and the token stream is run
    through a small state machine.  A "part" is one question pair
    (Q1 OR Q2); each sub-question records label, marks, Bloom's level and
    CO number.

    Fixes over the previous revision:
      * A sub-question that already has marks but is interrupted by the next
        SUB label / OR / PART header is now committed with blank blooms/CO
        instead of being silently discarded — this also makes the
        "please fill manually" warnings reachable mid-stream (previously the
        commit-on-SUB condition was dead code, since ``co_number`` is cleared
        the instant ``commit_sub`` runs).
      * ``after_or`` is cleared once the second question number of an OR pair
        is consumed, so in a paper shaped "3 OR 4, 5 OR 6" the standalone
        "5" starts a new part instead of overwriting the previous pair's Q2.

    Args:
        text: Raw paper text, one extracted layout cell per line.

    Returns:
        Dict with keys ``question_parts``, ``total_marks`` and ``warnings``.
    """
    lines = text.splitlines()
    tokens = tokenise(lines)
    warnings: list[str] = []
    parts: list[dict] = []
    part_number = 0
    current_q1: int | None = None
    current_q2: int | None = None
    after_or = False
    # Per-sub-question state.
    sub_label = ""
    marks = 0
    blooms = ""
    co_number = ""
    in_sub = False
    # Sequence tracker: after SUB we expect TEXT* MARKS BLOOMS CO.
    saw_marks = False
    saw_blooms = False

    def get_or_create_part() -> dict:
        # Parts are keyed by part_number and created lazily on first commit.
        for p in parts:
            if p["partNumber"] == part_number:
                return p
        p = {
            "partNumber": part_number,
            "question1Number": current_q1 or 1,
            "question2Number": current_q2 or (current_q1 + 1 if current_q1 else 2),
            "subQuestions": [],
        }
        parts.append(p)
        return p

    def commit_sub():
        # Flush the current sub-question into its part, then clear sub state.
        nonlocal in_sub, sub_label, marks, blooms, co_number, saw_marks, saw_blooms
        p = get_or_create_part()
        if after_or and current_q2:
            p["question2Number"] = current_q2
        p["subQuestions"].append({
            "label": sub_label,
            "marks": marks,
            "bloomsLevel": blooms,
            "co_number": co_number,
        })
        in_sub = False
        sub_label = ""
        marks = 0
        blooms = ""
        co_number = ""
        saw_marks = False
        saw_blooms = False

    def reset_sub():
        # Discard the current (marks-less) sub-question state.
        nonlocal in_sub, sub_label, marks, blooms, co_number, saw_marks, saw_blooms
        in_sub = False
        sub_label = ""
        marks = 0
        blooms = ""
        co_number = ""
        saw_marks = False
        saw_blooms = False

    def finish_sub():
        # Boundary handling: keep a partially-parsed sub that already has
        # marks (blooms/CO stay blank and trigger the warnings below);
        # a sub with no marks carries no usable data and is dropped.
        if saw_marks:
            commit_sub()
        else:
            reset_sub()

    for kind, value in tokens:
        # ── PART / OR bookkeeping ────────────────────────────────────────────
        if kind == TOKEN_PART:
            if in_sub:
                finish_sub()
            after_or = False
            continue
        if kind == TOKEN_OR:
            if in_sub:
                finish_sub()
            after_or = True
            continue
        # AMBIG: marks when we are mid-sub and still waiting for marks;
        # question number when outside a sub; otherwise noise.
        if kind == "AMBIG":
            val = int(value)
            if in_sub and not saw_marks and 1 <= val <= 25:
                marks = val
                saw_marks = True
            elif not in_sub:
                if after_or:
                    # Second number of an OR pair.
                    current_q2 = val
                    for p in parts:
                        if p["partNumber"] == part_number:
                            p["question2Number"] = val
                    # The OR pair is now complete: the next standalone
                    # number starts a fresh question pair.
                    after_or = False
                else:
                    current_q1 = val
                    current_q2 = val + 1
                    part_number += 1
            # Anything else (e.g. numbers inside question text) is noise.
            continue
        # ── Sub-question label ───────────────────────────────────────────────
        if kind == TOKEN_SUB:
            if in_sub:
                finish_sub()
            sub_label = value
            in_sub = True
            continue
        # ── Within a sub-question ────────────────────────────────────────────
        if in_sub:
            if kind == TOKEN_BLOOMS and saw_marks:
                blooms = value
                saw_blooms = True
            elif kind == TOKEN_CO and saw_blooms:
                co_number = value
                commit_sub()
            # TEXT inside a sub is the question body — not extracted.
            continue
        # Anything else outside a sub — ignore.
    # Flush a trailing sub that has marks but never saw blooms/CO.
    if in_sub and saw_marks:
        commit_sub()
    # ── Warnings ─────────────────────────────────────────────────────────────
    if not parts:
        warnings.append("No sub-questions detected. The paper layout may be unusual.")
    else:
        missing_co = sum(1 for p in parts for sq in p["subQuestions"] if not sq["co_number"])
        missing_bl = sum(1 for p in parts for sq in p["subQuestions"] if not sq["bloomsLevel"])
        if missing_co:
            warnings.append(f"{missing_co} sub-question(s) have no CO detected — please fill manually.")
        if missing_bl:
            warnings.append(f"{missing_bl} sub-question(s) have no Bloom's level detected — please fill manually.")
    total_marks = sum(sq["marks"] for p in parts for sq in p["subQuestions"])
    return {
        "question_parts": parts,
        "total_marks": total_marks,
        "warnings": warnings,
    }