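"""Parse cv.tex (a LaTeX CV) into cv.json for this Space's UI.

Reads the cv.tex sitting next to this script, extracts contact info,
education, certifications, skills, publications, experience, projects,
and coursework, and writes them as structured JSON. On any parsing
error it falls back to a minimal JSON so the front end can still load.
"""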
import re, json, pathlib, traceback

# --- robust paths relative to this file (works on Spaces & local) ---
BASE = pathlib.Path(__file__).parent.resolve()
TEX_PATH = str((BASE / "cv.tex").resolve())
OUT_JSON = str((BASE / "cv.json").resolve())

DEFAULT_LINKEDIN = "https://www.linkedin.com/in/rajiv-ranjan-jha-248a59103"

# ------------------------ utilities ------------------------

def safe_sub(pattern, repl, s):
    try:
        return re.sub(pattern, repl, s)
    except re.error:
        # leave text unchanged if a pattern trips Python's regex engine
        return s

def load_tex(path):
    text = pathlib.Path(path).read_text(encoding="utf-8", errors="ignore")
    # collapse tabs/multi-spaces so the regexes stay simple
    return re.sub(r"[ \t]+", " ", text)

def clean_tex(s):
    s = s or ""
    s = s.replace("~", " ")
    s = safe_sub(r"\\textbf\{([^}]*)\}", r"\1", s)
    s = safe_sub(r"\\textit\{([^}]*)\}", r"\1", s)
    s = safe_sub(r"\\&", "&", s)
    s = safe_sub(r"\\%", "%", s)
    s = safe_sub(r"\\_", "_", s)
    s = safe_sub(r"\\textsuperscript\{[^}]*\}", "", s)
    s = safe_sub(r"\\href\{([^}]*)\}\{([^}]*)\}", r"\2 (\1)", s)
    s = safe_sub(r"\\mbox\{([^}]*)\}", r"\1", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s
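
# A rough example of what clean_tex does:
#   clean_tex(r"\textbf{AI}~\& \href{https://x.y}{link}")
#     -> "AI & link (https://x.y)"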

def _normalize_url(u: str) -> str:
    if not u:
        return DEFAULT_LINKEDIN
    u = u.strip()
    # prepend a scheme for bare hosts like "www.x.y" or "linkedin.com/..."
    if not u.startswith(("http://", "https://")):
        u = "https://" + u
    return u.rstrip("/")
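
# e.g. _normalize_url("www.linkedin.com/in/x/") -> "https://www.linkedin.com/in/x"
#      _normalize_url("")                       -> DEFAULT_LINKEDIN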

# ------------------------ section helpers ------------------------

SEC_RX = re.compile(r"\\section\{([^}]+)\}(.*?)(?=\\section\{|\\end\{document\})", re.S)
TWOCOL_RX = re.compile(r"\\begin\{twocolentry\}\{(.*?)\}(.*?)\\end\{twocolentry\}", re.S)
HIGHLIGHTS_RX = re.compile(r"\\begin\{highlights\}(.*?)\\end\{highlights\}", re.S)
ITEMIZE_RX = re.compile(r"\\begin\{itemize\}\[.*?\](.*?)\\end\{itemize\}", re.S)
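
# These regexes assume a RenderCV-style template; a sketch of the shape
# they target (not verified against the actual cv.tex):
#
#   \begin{twocolentry}{right-column text, e.g. dates}
#       left-column content
#       \begin{highlights}
#           \item bullet text
#       \end{highlights}
#   \end{twocolentry}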

def section_map(tex):
    sections = {}
    for m in SEC_RX.finditer(tex):
        title = m.group(1).strip()
        body = m.group(2).strip()
        sections[title] = body
    return sections

def extract_twocolentries(body):
    blocks = []
    for m in TWOCOL_RX.finditer(body):
        right = clean_tex(m.group(1))
        content = m.group(2)
        blocks.append((right, content))
    return blocks

def extract_itemize_items(content):
    items = []
    im = HIGHLIGHTS_RX.search(content)
    if im:
        raw = im.group(1)
        items = [clean_tex(x) for x in re.findall(r"\\item\s+(.*)", raw)]
    else:
        im2 = ITEMIZE_RX.search(content)
        if im2:
            raw = im2.group(1)
            # first alternative handles "\item[label] text", second plain "\item text"
            pairs = re.findall(r"\\item.*?\]\s*(.*)|\\item\s+(.*)", raw)
            items = [a or b for (a, b) in pairs if (a or b)]
            items = [clean_tex(x) for x in items]
    return items
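
# e.g. for a highlights block with each \item on its own line:
#   "\begin{highlights}\n \item Did X\n \item Did Y\n\end{highlights}"
#   -> ["Did X", "Did Y"]
# (each \item must start on its own line: the "." in the findall patterns
# stops at newlines)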

# ------------------------ field parsers ------------------------

def parse_contact(tex):
    # Prefer metadata like \hypersetup{pdfauthor={Your Name}}
    auth_m = re.search(r"pdfauthor=\{([^}]*)\}", tex)
    name = auth_m.group(1).strip() if auth_m else None
    # Otherwise search AFTER \begin{document} and skip the template placeholder "#1"
    if not name:
        begin = tex.find(r"\begin{document}")
        region = tex[begin:] if begin != -1 else tex
        for m in re.finditer(r"\\textbf\{([^}]*)\}", region):
            cand = m.group(1).strip()
            if cand and cand != "#1":
                name = cand
                break
    if not name:
        name = "Rajiv Ranjan"
    email_m = re.search(r"mailto:([^\}]+)\}", tex)
    email = email_m.group(1) if email_m else "ranjan44@purdue.edu"
    return {
        "name": name,
        "email": email,
        "phone": "",
        "links": {
            "linkedin": DEFAULT_LINKEDIN,
            "scholar": "https://scholar.google.com/citations?user=4SEF19AAAAAJ&hl=en",
            "cv_pdf": "https://huggingface.co/spaces/hugging4rajiv/rajiv.r/resolve/main/Rajiv_Ranjan_CV.pdf",
        },
        "roles": ["Visiting Research Scholar @ Purdue University, USA"],
    }

def parse_education(body):
    out = []
    for right, content in extract_twocolentries(body):
        m = re.search(r"\\item\[\s*\\textbf\{([^}]*)\}\s*\]\s*(.*)", content)
        if not m:
            continue
        degree = clean_tex(m.group(1))
        rest = clean_tex(m.group(2))
        parts = [p.strip() for p in rest.split(",")]
        school = parts[0] if parts else ""
        field = parts[1] if len(parts) > 1 else ""
        notes = ", ".join(parts[2:]) if len(parts) > 2 else ""
        out.append({
            "degree": degree,
            "school": school,
            "field": field,
            "dates": clean_tex(right),
            "notes": notes,
        })
    return out
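
# Entry shape this expects (a sketch inferred from the regex above, not
# verified against the actual cv.tex):
#   \begin{twocolentry}{2021 -- 2025}
#       \item[\textbf{Ph.D.}] Some University, Some Field, any extra notes
#   \end{twocolentry}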

def parse_certifications(body):
    out = []
    for right, content in extract_twocolentries(body):
        items = extract_itemize_items(content)
        # `right` has already been through clean_tex, which collapses real
        # newlines, so split on the LaTeX line break "\\" instead
        lines = [l.strip() for l in right.split("\\\\") if l.strip()]
        org = lines[0] if lines else ""
        date_line = lines[-1] if lines else ""
        out.append({"org": org, "dates": date_line, "items": items})
    return out

def parse_skills(body):
    out = {}
    for m in re.finditer(r"\\item\[\s*\\textbf\{([^}]*)\}\]\s*(.*)", body):
        cat = clean_tex(m.group(1))
        vals = [v.strip() for v in clean_tex(m.group(2)).split(",")]
        out[cat] = [v for v in vals if v]
    return out
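
# e.g. "\item[\textbf{Languages}] Python, C++, MATLAB"
#   -> {"Languages": ["Python", "C++", "MATLAB"]}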

def parse_publications(body):
    out = []
    for right, content in extract_twocolentries(body):
        link_m = re.search(r"\\href\{([^}]*)\}", content)
        url = link_m.group(1).strip() if link_m else ""
        text = clean_tex(re.sub(r"\\href\{[^}]*\}\{[^}]*\}", "", content))
        # clean_tex collapsed real newlines, so take the part before any
        # LaTeX "\\" break as the title
        title_line = text.split("\\\\")[0].strip()
        venue = clean_tex(right.split("\\\\")[0] if "\\\\" in right else right)
        date = clean_tex(right.split("\\\\")[-1]) if "\\\\" in right else ""
        out.append({"title": title_line, "venue": venue, "date": date, "url": url})
    return out
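
# e.g. a right column of "IEEE Xplore \\ 2024" yields
#   venue == "IEEE Xplore", date == "2024"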

def parse_experience(body):
    out = []
    for right, content in extract_twocolentries(body):
        org_role_m = re.search(r"\\textbf\{([^}]*)\}\s*,?\s*([^\n]*)", content)
        org = clean_tex(org_role_m.group(1)) if org_role_m else clean_tex(content[:50])
        role = clean_tex(org_role_m.group(2)) if org_role_m else ""
        items = extract_itemize_items(content)
        # `right` has already been cleaned, so split on the LaTeX "\\" break
        lines = [l.strip() for l in right.split("\\\\") if l.strip()]
        location = lines[0] if lines else ""
        dates = lines[-1] if lines else ""
        out.append({"org": org, "role": role, "location": location, "dates": dates, "highlights": items})
    return out

def parse_projects(body):
    out = []
    for right, content in extract_twocolentries(body):
        # `right` has been through clean_tex, which rewrites \href{u}{t} as
        # "t (u)", so pull any URL out of the parentheses; fall back to the
        # raw body, where \href is still intact
        link_m = re.search(r"\((https?://[^)\s]+)\)", right)
        url = link_m.group(1) if link_m else ""
        if not url:
            link2 = re.search(r"\\href\{([^}]*)\}", content)
            url = link2.group(1).strip() if link2 else ""
        title_m = re.search(r"\\textbf\{([^}]*)\}", content)
        title = clean_tex(title_m.group(1)) if title_m else clean_tex(content[:60])
        items = extract_itemize_items(content)
        summary = items[0] if items else ""
        out.append({"title": title, "url": url, "summary": summary})
    return out

def parse_coursework(body):
    out = []
    for right, content in extract_twocolentries(body):
        credit = clean_tex(right)
        credit = re.sub(r"(?i)\s*credits?\s*$", "", credit).strip()
        items = extract_itemize_items(content)
        for it in items:
            out.append({"course": it, "credit": credit or ""})
    return out
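
# e.g. an entry whose right column reads "3 Credits" and whose body lists
# two \item courses yields two rows, each with credit == "3"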

# ------------------------ main ------------------------

def main():
    try:
        tex = load_tex(TEX_PATH)
        secs = section_map(tex)

        def get_sec(*candidates):
            for cand in candidates:
                for title, body in secs.items():
                    if cand.lower() in title.lower():
                        return body
            return ""

        data = {
            "contact": parse_contact(tex),
            "education": parse_education(secs.get("Education", "") or get_sec("Education")),
            "certifications": parse_certifications(secs.get("Certifications", "") or get_sec("Certifications")),
            "skills": parse_skills(secs.get("Skills", "") or get_sec("Skills")),
            "publications": parse_publications(secs.get("Scholarly Articles", "") or get_sec("Scholarly Articles", "Publications")),
            "experience": parse_experience(secs.get("Experience", "") or get_sec("Experience")),
            "projects": parse_projects(secs.get("Projects", "") or get_sec("Projects")),
            "coursework": parse_coursework(secs.get("Relevant Coursework (Part of my PHD Course Credit)", "") or get_sec("Coursework")),
        }
        pathlib.Path(OUT_JSON).write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8")
        print(f"Wrote {OUT_JSON}")
    except Exception as e:
        # Write a minimal JSON so the UI can still load
        minimal = {
            "contact": {
                "name": "Rajiv Ranjan",
                "email": "ranjan44@purdue.edu",
                "roles": ["Visiting Research Scholar @ Purdue University, USA"],
                "links": {"scholar": "", "cv_pdf": "", "linkedin": DEFAULT_LINKEDIN},
            },
            "skills": {}, "publications": [], "projects": [],
            "education": [], "experience": [], "certifications": [], "coursework": [],
        }
        pathlib.Path(OUT_JSON).write_text(json.dumps(minimal, indent=2, ensure_ascii=False), encoding="utf-8")
        print("Parser fallback due to error:", type(e).__name__, str(e))
        print(traceback.format_exc())

if __name__ == "__main__":
    main()