"""
context_acquisition.py
Functions for acquiring context from various sources including PDF text extraction,
GitHub profiles, and job posting text.
"""
import re
import logging
import io
import json
import unicodedata
from pathlib import Path
from datetime import datetime
import PyPDF2
from functions.helper import clean_text_whitespace
# pylint: disable=broad-exception-caught


def extract_text(pdf_file: str) -> dict:
    """
    Extract and structure text content from an uploaded LinkedIn resume export PDF file
    for optimal LLM processing.

    Args:
        pdf_file: Path to the uploaded PDF file as a string

    Returns:
        dict: Resume text keyed by section name (see example below), or None if
        extraction or parsing fails

    Example:
        {
            "contact_info": "...",
            "summary": "...",
            "skills": "...",
            "experience": "...",
            "education": "...",
            "certifications": "...",
        }
    """
logger = logging.getLogger(f'{__name__}.extract_text')
try:
# Read the PDF file from the file path
with open(pdf_file, 'rb') as file:
file_content = file.read()
# Create PDF reader from the file content
pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
# Extract text from all pages
extracted_text = ""
num_pages = len(pdf_reader.pages)
logger.info("Extracting text from %d pages", num_pages)
for page_num in range(num_pages):
try:
page = pdf_reader.pages[page_num]
page_text = page.extract_text()
extracted_text += page_text + "\n\n"
except Exception as e:
logger.warning("Error extracting text from page %d: %s", page_num + 1, str(e))
continue
logger.info("Extracted text length: %d characters", len(extracted_text))
# Clean and structure the extracted text for LLM consumption
structured_content = _parse_resume_text(extracted_text)
if not structured_content:
return None
logger.info("Found sections: %s", list(structured_content.keys()))
# Save results to JSON file
try:
linkedin_profile_dir = Path(__file__).parent.parent / "data" / "linkedin_profile"
linkedin_profile_dir.mkdir(parents=True, exist_ok=True)
# Create timestamped filename
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_file = linkedin_profile_dir / f"linkedin_resume_{timestamp}.json"
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(structured_content, f, indent=2, ensure_ascii=False)
except Exception as save_error:
logger.warning("Failed to save LinkedIn resume extraction to file: %s", str(save_error))
return structured_content
except Exception as e:
logger.error("Error processing PDF file: %s", str(e))
return None


def _parse_resume_text(text: str) -> dict:
    """
    Parse resume text into logical sections for optimal LLM processing.

    Args:
        text (str): Raw extracted text from PDF

    Returns:
        dict: Section name mapped to cleaned section text, or None if the input text is empty
    """
if not text:
return None
# Define section patterns (common LinkedIn export sections)
section_patterns = {
"contact_info": r"(?i)(contact|personal|profile)\s*(?:information)?",
"summary": r"(?i)(summary|about|overview|profile)",
"skills": r"(?i)(skills|expertise|competencies|proficiencies)",
"experience": r"(?i)(experience|work|employment|professional)",
"education": r"(?i)(education|academic|university|college|school)",
"certifications": r"(?i)(certification|certificate|license)",
}
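    # Patterns are checked against each line in dict order and the first match wins,
    # so an ambiguous header such as "Profile" lands in the earlier contact_info bucket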
# Split text into lines for processing
lines = text.split('\n')
sections = {}
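    # Lines that appear before the first recognized section header are collected
    # under a catch-all "general" key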
current_section = "general"
current_content = []
for line in lines:
line = line.strip()
if not line:
continue
# Check if line is a section header
section_found = None
for section_name, pattern in section_patterns.items():
if re.match(pattern, line):
section_found = section_name
break
if section_found:
# Save previous section content
if current_content:
sections[current_section] = '\n'.join(current_content)
# Start new section
current_section = section_found
current_content = [line]
else:
current_content.append(line)
# Save the last section
if current_content:
sections[current_section] = '\n'.join(current_content)
# Clean each section
for section_name, content in sections.items():
sections[section_name] = _clean_section(content)
return sections


def _clean_section(text: str) -> str:
"""
Clean a section of text by normalizing whitespace and removing unnecessary characters.
Args:
text (str): The text section to clean
Returns:
str: Cleaned text section
"""
# Normalize unicode characters to avoid issues with special characters
text = unicodedata.normalize('NFKC', text)
    # Remove `Page n of n` footers added by the LinkedIn export
text = re.sub(r'Page \d+ of \d+', '', text)
# Clean redundant whitespace
text = clean_text_whitespace(text)
return text.strip()
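

# Minimal usage sketch for manual testing: runs the full extraction pipeline on a
# LinkedIn export and prints a preview of each section. The path below is a
# hypothetical placeholder; point it at a real "Profile.pdf" export before running.
if __name__ == "__main__":
    import pprint

    logging.basicConfig(level=logging.INFO)

    # Hypothetical path to a LinkedIn resume export PDF
    sections = extract_text("data/linkedin_profile/Profile.pdf")

    if sections is None:
        print("Extraction failed")
    else:
        # Show the first 80 characters of each extracted section
        pprint.pprint({name: content[:80] for name, content in sections.items()})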