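"""Parse cv.tex into cv.json for the Space's UI.

This is a best-effort regex pass over one specific LaTeX CV template,
not a general LaTeX parser; the default names, links, and section
titles below are tailored to that CV.
"""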
import re, json, pathlib, traceback
# --- robust paths relative to this file (works on Spaces & local) ---
BASE = pathlib.Path(__file__).parent.resolve()
TEX_PATH = str((BASE / "cv.tex").resolve())
OUT_JSON = str((BASE / "cv.json").resolve())
DEFAULT_LINKEDIN = "https://www.linkedin.com/in/rajiv-ranjan-jha-248a59103"
# ------------------------ utilities ------------------------
def safe_sub(pattern, repl, s):
    try:
        return re.sub(pattern, repl, s)
    except re.error:
        # leave text unchanged if a pattern trips Python's regex engine
        return s
def load_tex(path):
    text = pathlib.Path(path).read_text(encoding="utf-8", errors="ignore")
    # collapse tabs/runs of spaces so the regexes stay simple (newlines are kept)
    return re.sub(r"[ \t]+", " ", text)
def clean_tex(s):
    """Strip common LaTeX markup and collapse all whitespace to single spaces."""
    s = s or ""
    s = s.replace("~", " ")
    s = safe_sub(r"\\textbf\{([^}]*)\}", r"\1", s)
    s = safe_sub(r"\\textit\{([^}]*)\}", r"\1", s)
    s = safe_sub(r"\\&", "&", s)
    s = safe_sub(r"\\%", "%", s)
    s = safe_sub(r"\\_", "_", s)
    s = safe_sub(r"\\textsuperscript\{[^}]*\}", "", s)
    s = safe_sub(r"\\href\{([^}]*)\}\{([^}]*)\}", r"\2 (\1)", s)
    s = safe_sub(r"\\mbox\{([^}]*)\}", r"\1", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s
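# Illustrative round-trip (hypothetical input, not taken from cv.tex):
#   clean_tex(r"\textbf{AI} \& \href{https://example.org}{ML}")
#   -> "AI & ML (https://example.org)"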
def _normalize_url(u: str) -> str:
    if not u:
        return DEFAULT_LINKEDIN
    u = u.strip()
    # prefix a scheme for bare "www." / "linkedin.com" / other bare-domain inputs
    if not u.startswith(("http://", "https://")):
        u = "https://" + u
    return u.rstrip("/")
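# Examples of the normalization (this helper is currently unused in this file):
#   _normalize_url("www.linkedin.com/in/xyz/") -> "https://www.linkedin.com/in/xyz"
#   _normalize_url("") -> DEFAULT_LINKEDIN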
# ------------------------ section helpers ------------------------
SEC_RX = re.compile(r"\\section\{([^}]+)\}(.*?)(?=\\section\{|\\end\{document\})", re.S)
TWOCOL_RX = re.compile(r"\\begin\{twocolentry\}\{(.*?)\}(.*?)\\end\{twocolentry\}", re.S)
HIGHLIGHTS_RX = re.compile(r"\\begin\{highlights\}(.*?)\\end\{highlights\}", re.S)
ITEMIZE_RX = re.compile(r"\\begin\{itemize\}\[.*?\](.*?)\\end\{itemize\}", re.S)
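# The regexes above assume a RenderCV-style template, roughly:
#   \begin{twocolentry}{<right column>}
#     <left-column content, possibly with \begin{highlights} \item ... \end{highlights}>
#   \end{twocolentry}
# (Shape inferred from the patterns themselves; the actual cv.tex may differ.)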
def section_map(tex):
    sections = {}
    for m in SEC_RX.finditer(tex):
        title = m.group(1).strip()
        body = m.group(2).strip()
        sections[title] = body
    return sections
def extract_twocolentries(body):
    blocks = []
    for m in TWOCOL_RX.finditer(body):
        # NB: clean_tex collapses newlines in the right column, so callers
        # must split it on the LaTeX "\\" break rather than on "\n"
        right = clean_tex(m.group(1))
        content = m.group(2)
        blocks.append((right, content))
    return blocks
def extract_itemize_items(content):
    items = []
    im = HIGHLIGHTS_RX.search(content)
    if im:
        raw = im.group(1)
        items = [clean_tex(x) for x in re.findall(r"\\item\s+(.*)", raw)]
    else:
        im2 = ITEMIZE_RX.search(content)
        if im2:
            raw = im2.group(1)
            # an \item may carry an optional [label]; either alternative
            # of the pattern captures the text that follows it
            pairs = re.findall(r"\\item.*?\]\s*(.*)|\\item\s+(.*)", raw)
            items = [a or b for (a, b) in pairs if (a or b)]
            items = [clean_tex(x) for x in items]
    return items
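# Example (hypothetical snippet; note each \item must sit on its own line,
# since load_tex keeps newlines and the \item patterns stop at end-of-line):
#   extract_itemize_items("\\begin{highlights}\n\\item Did X\n\\item Did Y\n\\end{highlights}")
#   -> ["Did X", "Did Y"]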
# ------------------------ field parsers ------------------------
def parse_contact(tex):
    # Prefer metadata like \hypersetup{pdfauthor={Your Name}}
    auth_m = re.search(r"pdfauthor=\{([^}]*)\}", tex)
    name = auth_m.group(1).strip() if auth_m else None
    # Otherwise search AFTER \begin{document} and skip the template placeholder "#1"
    if not name:
        begin = tex.find(r"\begin{document}")
        region = tex[begin:] if begin != -1 else tex
        for m in re.finditer(r"\\textbf\{([^}]*)\}", region):
            cand = m.group(1).strip()
            if cand and cand != "#1":
                name = cand
                break
    if not name:
        name = "Rajiv Ranjan"
    email_m = re.search(r"mailto:([^\}]+)\}", tex)
    email = email_m.group(1) if email_m else "ranjan44@purdue.edu"
    return {
        "name": name,
        "email": email,
        "phone": "",
        "links": {
            "linkedin": DEFAULT_LINKEDIN,
            "scholar": "https://scholar.google.com/citations?user=4SEF19AAAAAJ&hl=en",
            "cv_pdf": "https://huggingface.co/spaces/hugging4rajiv/rajiv.r/resolve/main/Rajiv_Ranjan_CV.pdf"
        },
        "roles": ["Visiting Research Scholar @ Purdue University, USA"]
    }
def parse_education(body):
    out = []
    for right, content in extract_twocolentries(body):
        m = re.search(r"\\item\[\s*\\textbf\{([^}]*)\}\s*\]\s*(.*)", content)
        if not m:
            continue
        degree = clean_tex(m.group(1))
        rest = clean_tex(m.group(2))
        parts = [p.strip() for p in rest.split(",")]
        school = parts[0] if parts else ""
        field = parts[1] if len(parts) > 1 else ""
        notes = ", ".join(parts[2:]) if len(parts) > 2 else ""
        out.append({"degree": degree, "school": school, "field": field, "dates": clean_tex(right), "notes": notes})
    return out
def parse_certifications(body):
    out = []
    for right, content in extract_twocolentries(body):
        items = extract_itemize_items(content)
        # `right` was flattened by clean_tex, so split on the LaTeX "\\"
        # break; splitting on "\n" would always yield a single element
        lines = [l.strip() for l in right.split("\\\\") if l.strip()]
        org = clean_tex(lines[0]) if lines else ""
        date_line = clean_tex(lines[-1]) if lines else ""
        out.append({"org": org, "dates": date_line, "items": items})
    return out
def parse_skills(body):
    out = {}
    for m in re.finditer(r"\\item\[\s*\\textbf\{([^}]*)\}\]\s*(.*)", body):
        cat = clean_tex(m.group(1))
        vals = [v.strip() for v in clean_tex(m.group(2)).split(",")]
        out[cat] = [v for v in vals if v]
    return out
def parse_publications(body):
    out = []
    for right, content in extract_twocolentries(body):
        link_m = re.search(r"\\href\{([^}]*)\}", content)
        url = link_m.group(1).strip() if link_m else ""
        text = clean_tex(re.sub(r"\\href\{[^}]*\}\{[^}]*\}", "", content))
        # clean_tex collapsed all whitespace, so `text` is a single line already
        title_line = text
        venue = clean_tex(right.split("\\\\")[0] if "\\\\" in right else right)
        date = clean_tex(right.split("\\\\")[-1]) if "\\\\" in right else ""
        out.append({"title": title_line, "venue": venue, "date": date, "url": url})
    return out
def parse_experience(body):
    out = []
    for right, content in extract_twocolentries(body):
        org_role_m = re.search(r"\\textbf\{([^}]*)\}\s*,?\s*([^\n]*)", content)
        org = org_role_m.group(1).strip() if org_role_m else clean_tex(content[:50])
        role = org_role_m.group(2).strip() if org_role_m else ""
        items = extract_itemize_items(content)
        # split on the LaTeX "\\" break; clean_tex already removed real newlines
        lines = [l.strip() for l in right.split("\\\\") if l.strip()]
        location = clean_tex(lines[0]) if lines else ""
        dates = clean_tex(lines[-1]) if lines else ""
        out.append({"org": org, "role": role, "location": location, "dates": dates, "highlights": items})
    return out
def parse_projects(body):
    out = []
    for right, content in extract_twocolentries(body):
        # clean_tex already rewrote \href{url}{text} in `right` as "text (url)",
        # so recover an http(s) URL from that form rather than matching \href
        link_m = re.search(r"\((https?://[^)\s]+)\)", right)
        url = link_m.group(1) if link_m else ""
        if not url:
            # fall back to a raw \href in the (uncleaned) entry body
            link2 = re.search(r"\\href\{([^}]*)\}", content)
            url = link2.group(1).strip() if link2 else ""
        title_m = re.search(r"\\textbf\{([^}]*)\}", content)
        title = clean_tex(title_m.group(1)) if title_m else clean_tex(content[:60])
        items = extract_itemize_items(content)
        summary = items[0] if items else ""
        out.append({"title": title, "url": url, "summary": summary})
    return out
def parse_coursework(body):
    out = []
    for right, content in extract_twocolentries(body):
        credit = clean_tex(right)
        # drop a trailing "credit"/"credits" label, keeping just the value
        credit = re.sub(r"(?i)\s*credit[s]?\s*$", "", credit).strip()
        items = extract_itemize_items(content)
        for it in items:
            out.append({"course": it, "credit": credit or ""})
    return out
# ------------------------ main ------------------------
def main():
    try:
        tex = load_tex(TEX_PATH)
        secs = section_map(tex)

        def get_sec(*candidates):
            # case-insensitive substring match over section titles
            for cand in candidates:
                for title, body in secs.items():
                    if cand.lower() in title.lower():
                        return body
            return ""

        data = {
            "contact": parse_contact(tex),
            "education": parse_education(secs.get("Education", "") or get_sec("Education")),
            "certifications": parse_certifications(secs.get("Certifications", "") or get_sec("Certifications")),
            "skills": parse_skills(secs.get("Skills", "") or get_sec("Skills")),
            "publications": parse_publications(secs.get("Scholarly Articles", "") or get_sec("Scholarly Articles", "Publications")),
            "experience": parse_experience(secs.get("Experience", "") or get_sec("Experience")),
            "projects": parse_projects(secs.get("Projects", "") or get_sec("Projects")),
            "coursework": parse_coursework(secs.get("Relevant Coursework (Part of my PHD Course Credit)", "") or get_sec("Coursework")),
        }
        pathlib.Path(OUT_JSON).write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8")
        print(f"Wrote {OUT_JSON}")
    except Exception as e:
        # Write a minimal JSON so the UI can still load
        minimal = {
            "contact": {
                "name": "Rajiv Ranjan",
                "email": "ranjan44@purdue.edu",
                "roles": ["Visiting Research Scholar @ Purdue University, USA"],
                "links": {"scholar": "", "cv_pdf": "", "linkedin": DEFAULT_LINKEDIN},
            },
            "skills": {}, "publications": [], "projects": [],
            "education": [], "experience": [], "certifications": [], "coursework": []
        }
        pathlib.Path(OUT_JSON).write_text(json.dumps(minimal, indent=2, ensure_ascii=False), encoding="utf-8")
        print("Parser fallback due to error:", type(e).__name__, str(e))
        print(traceback.format_exc())
if __name__ == "__main__":
    main()
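# Typical invocation (e.g., at Space startup, before the UI reads cv.json):
#   python parse_cv_to_json.py   # writes cv.json next to this file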