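"""Parse cv.tex into cv.json for the Space's UI.

This is a best-effort regex pass over one specific LaTeX CV template,
not a general LaTeX parser; the default names, links, and section
titles below are tailored to that CV.
"""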
import re, json, pathlib, traceback
# --- robust paths relative to this file (works on Spaces & local) ---
BASE = pathlib.Path(__file__).parent.resolve()
TEX_PATH = str((BASE / "cv.tex").resolve())
OUT_JSON = str((BASE / "cv.json").resolve())
DEFAULT_LINKEDIN = "https://www.linkedin.com/in/rajiv-ranjan-jha-248a59103"
# ------------------------ utilities ------------------------
def safe_sub(pattern, repl, s):
    try:
        return re.sub(pattern, repl, s)
    except re.error:
        # leave text unchanged if a pattern trips Python's regex engine
        return s
def load_tex(path):
    text = pathlib.Path(path).read_text(encoding="utf-8", errors="ignore")
    # collapse tabs/runs of spaces so the regexes stay simple (newlines are kept)
    return re.sub(r"[ \t]+", " ", text)
def clean_tex(s):
    """Strip common LaTeX markup and collapse all whitespace to single spaces."""
    s = s or ""
    s = s.replace("~", " ")
    s = safe_sub(r"\\textbf\{([^}]*)\}", r"\1", s)
    s = safe_sub(r"\\textit\{([^}]*)\}", r"\1", s)
    s = safe_sub(r"\\&", "&", s)
    s = safe_sub(r"\\%", "%", s)
    s = safe_sub(r"\\_", "_", s)
    s = safe_sub(r"\\textsuperscript\{[^}]*\}", "", s)
    s = safe_sub(r"\\href\{([^}]*)\}\{([^}]*)\}", r"\2 (\1)", s)
    s = safe_sub(r"\\mbox\{([^}]*)\}", r"\1", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s
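# Illustrative round-trip (hypothetical input, not taken from cv.tex):
#   clean_tex(r"\textbf{AI} \& \href{https://example.org}{ML}")
#   -> "AI & ML (https://example.org)"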
def _normalize_url(u: str) -> str:
    if not u:
        return DEFAULT_LINKEDIN
    u = u.strip()
    # prefix a scheme for bare "www." / "linkedin.com" / other bare-domain inputs
    if not u.startswith(("http://", "https://")):
        u = "https://" + u
    return u.rstrip("/")
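# Examples of the normalization (this helper is currently unused in this file):
#   _normalize_url("www.linkedin.com/in/xyz/") -> "https://www.linkedin.com/in/xyz"
#   _normalize_url("") -> DEFAULT_LINKEDIN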
# ------------------------ section helpers ------------------------
SEC_RX = re.compile(r"\\section\{([^}]+)\}(.*?)(?=\\section\{|\\end\{document\})", re.S)
TWOCOL_RX = re.compile(r"\\begin\{twocolentry\}\{(.*?)\}(.*?)\\end\{twocolentry\}", re.S)
HIGHLIGHTS_RX = re.compile(r"\\begin\{highlights\}(.*?)\\end\{highlights\}", re.S)
ITEMIZE_RX = re.compile(r"\\begin\{itemize\}\[.*?\](.*?)\\end\{itemize\}", re.S)
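# The regexes above assume a RenderCV-style template, roughly:
#   \begin{twocolentry}{<right column>}
#     <left-column content, possibly with \begin{highlights} \item ... \end{highlights}>
#   \end{twocolentry}
# (Shape inferred from the patterns themselves; the actual cv.tex may differ.)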
def section_map(tex):
    sections = {}
    for m in SEC_RX.finditer(tex):
        title = m.group(1).strip()
        body = m.group(2).strip()
        sections[title] = body
    return sections
def extract_twocolentries(body):
    blocks = []
    for m in TWOCOL_RX.finditer(body):
        # NB: clean_tex collapses newlines in the right column, so callers
        # must split it on the LaTeX "\\" break rather than on "\n"
        right = clean_tex(m.group(1))
        content = m.group(2)
        blocks.append((right, content))
    return blocks
def extract_itemize_items(content):
    items = []
    im = HIGHLIGHTS_RX.search(content)
    if im:
        raw = im.group(1)
        items = [clean_tex(x) for x in re.findall(r"\\item\s+(.*)", raw)]
    else:
        im2 = ITEMIZE_RX.search(content)
        if im2:
            raw = im2.group(1)
            # an \item may carry an optional [label]; either alternative
            # of the pattern captures the text that follows it
            pairs = re.findall(r"\\item.*?\]\s*(.*)|\\item\s+(.*)", raw)
            items = [a or b for (a, b) in pairs if (a or b)]
            items = [clean_tex(x) for x in items]
    return items
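# Example (hypothetical snippet; note each \item must sit on its own line,
# since load_tex keeps newlines and the \item patterns stop at end-of-line):
#   extract_itemize_items("\\begin{highlights}\n\\item Did X\n\\item Did Y\n\\end{highlights}")
#   -> ["Did X", "Did Y"]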
# ------------------------ field parsers ------------------------
def parse_contact(tex):
    # Prefer metadata like \hypersetup{pdfauthor={Your Name}}
    auth_m = re.search(r"pdfauthor=\{([^}]*)\}", tex)
    name = auth_m.group(1).strip() if auth_m else None
    # Otherwise search AFTER \begin{document} and skip the template placeholder "#1"
    if not name:
        begin = tex.find(r"\begin{document}")
        region = tex[begin:] if begin != -1 else tex
        for m in re.finditer(r"\\textbf\{([^}]*)\}", region):
            cand = m.group(1).strip()
            if cand and cand != "#1":
                name = cand
                break
    if not name:
        name = "Rajiv Ranjan"
    email_m = re.search(r"mailto:([^\}]+)\}", tex)
    email = email_m.group(1) if email_m else "ranjan44@purdue.edu"
    return {
        "name": name,
        "email": email,
        "phone": "",
        "links": {
            "linkedin": DEFAULT_LINKEDIN,
            "scholar": "https://scholar.google.com/citations?user=4SEF19AAAAAJ&hl=en",
            "cv_pdf": "https://huggingface.co/spaces/hugging4rajiv/rajiv.r/resolve/main/Rajiv_Ranjan_CV.pdf"
        },
        "roles": ["Visiting Research Scholar @ Purdue University, USA"]
    }
def parse_education(body):
    out = []
    for right, content in extract_twocolentries(body):
        m = re.search(r"\\item\[\s*\\textbf\{([^}]*)\}\s*\]\s*(.*)", content)
        if not m:
            continue
        degree = clean_tex(m.group(1))
        rest = clean_tex(m.group(2))
        parts = [p.strip() for p in rest.split(",")]
        school = parts[0] if parts else ""
        field = parts[1] if len(parts) > 1 else ""
        notes = ", ".join(parts[2:]) if len(parts) > 2 else ""
        out.append({"degree": degree, "school": school, "field": field, "dates": clean_tex(right), "notes": notes})
    return out
def parse_certifications(body):
    out = []
    for right, content in extract_twocolentries(body):
        items = extract_itemize_items(content)
        # `right` was flattened by clean_tex, so split on the LaTeX "\\"
        # break; splitting on "\n" would always yield a single element
        lines = [l.strip() for l in right.split("\\\\") if l.strip()]
        org = clean_tex(lines[0]) if lines else ""
        date_line = clean_tex(lines[-1]) if lines else ""
        out.append({"org": org, "dates": date_line, "items": items})
    return out
def parse_skills(body):
    out = {}
    for m in re.finditer(r"\\item\[\s*\\textbf\{([^}]*)\}\]\s*(.*)", body):
        cat = clean_tex(m.group(1))
        vals = [v.strip() for v in clean_tex(m.group(2)).split(",")]
        out[cat] = [v for v in vals if v]
    return out
def parse_publications(body):
    out = []
    for right, content in extract_twocolentries(body):
        link_m = re.search(r"\\href\{([^}]*)\}", content)
        url = link_m.group(1).strip() if link_m else ""
        text = clean_tex(re.sub(r"\\href\{[^}]*\}\{[^}]*\}", "", content))
        # clean_tex collapsed all whitespace, so `text` is a single line already
        title_line = text
        venue = clean_tex(right.split("\\\\")[0] if "\\\\" in right else right)
        date = clean_tex(right.split("\\\\")[-1]) if "\\\\" in right else ""
        out.append({"title": title_line, "venue": venue, "date": date, "url": url})
    return out
def parse_experience(body):
    out = []
    for right, content in extract_twocolentries(body):
        org_role_m = re.search(r"\\textbf\{([^}]*)\}\s*,?\s*([^\n]*)", content)
        org = org_role_m.group(1).strip() if org_role_m else clean_tex(content[:50])
        role = org_role_m.group(2).strip() if org_role_m else ""
        items = extract_itemize_items(content)
        # split on the LaTeX "\\" break; clean_tex already removed real newlines
        lines = [l.strip() for l in right.split("\\\\") if l.strip()]
        location = clean_tex(lines[0]) if lines else ""
        dates = clean_tex(lines[-1]) if lines else ""
        out.append({"org": org, "role": role, "location": location, "dates": dates, "highlights": items})
    return out
def parse_projects(body):
    out = []
    for right, content in extract_twocolentries(body):
        # clean_tex already rewrote \href{url}{text} in `right` as "text (url)",
        # so recover an http(s) URL from that form rather than matching \href
        link_m = re.search(r"\((https?://[^)\s]+)\)", right)
        url = link_m.group(1) if link_m else ""
        if not url:
            # fall back to a raw \href in the (uncleaned) entry body
            link2 = re.search(r"\\href\{([^}]*)\}", content)
            url = link2.group(1).strip() if link2 else ""
        title_m = re.search(r"\\textbf\{([^}]*)\}", content)
        title = clean_tex(title_m.group(1)) if title_m else clean_tex(content[:60])
        items = extract_itemize_items(content)
        summary = items[0] if items else ""
        out.append({"title": title, "url": url, "summary": summary})
    return out
def parse_coursework(body):
    out = []
    for right, content in extract_twocolentries(body):
        credit = clean_tex(right)
        # drop a trailing "credit"/"credits" label, keeping just the value
        credit = re.sub(r"(?i)\s*credit[s]?\s*$", "", credit).strip()
        items = extract_itemize_items(content)
        for it in items:
            out.append({"course": it, "credit": credit or ""})
    return out
# ------------------------ main ------------------------
def main():
    try:
        tex = load_tex(TEX_PATH)
        secs = section_map(tex)

        def get_sec(*candidates):
            # case-insensitive substring match over section titles
            for cand in candidates:
                for title, body in secs.items():
                    if cand.lower() in title.lower():
                        return body
            return ""

        data = {
            "contact": parse_contact(tex),
            "education": parse_education(secs.get("Education", "") or get_sec("Education")),
            "certifications": parse_certifications(secs.get("Certifications", "") or get_sec("Certifications")),
            "skills": parse_skills(secs.get("Skills", "") or get_sec("Skills")),
            "publications": parse_publications(secs.get("Scholarly Articles", "") or get_sec("Scholarly Articles", "Publications")),
            "experience": parse_experience(secs.get("Experience", "") or get_sec("Experience")),
            "projects": parse_projects(secs.get("Projects", "") or get_sec("Projects")),
            "coursework": parse_coursework(secs.get("Relevant Coursework (Part of my PHD Course Credit)", "") or get_sec("Coursework")),
        }
        pathlib.Path(OUT_JSON).write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8")
        print(f"Wrote {OUT_JSON}")
    except Exception as e:
        # Write a minimal JSON so the UI can still load
        minimal = {
            "contact": {
                "name": "Rajiv Ranjan",
                "email": "ranjan44@purdue.edu",
                "roles": ["Visiting Research Scholar @ Purdue University, USA"],
                "links": {"scholar": "", "cv_pdf": "", "linkedin": DEFAULT_LINKEDIN},
            },
            "skills": {}, "publications": [], "projects": [],
            "education": [], "experience": [], "certifications": [], "coursework": []
        }
        pathlib.Path(OUT_JSON).write_text(json.dumps(minimal, indent=2, ensure_ascii=False), encoding="utf-8")
        print("Parser fallback due to error:", type(e).__name__, str(e))
        print(traceback.format_exc())
if __name__ == "__main__":
    main()
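# Typical invocation (e.g., at Space startup, before the UI reads cv.json):
#   python parse_cv_to_json.py   # writes cv.json next to this file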