"""
context_acquisition.py
Functions for acquiring context from various sources including PDF text extraction,
GitHub profiles, and job posting text.
"""
import re
import logging
import io
import json
import unicodedata
from pathlib import Path
from datetime import datetime
import PyPDF2
from functions.helper import clean_text_whitespace
# pylint: disable=broad-exception-caught


def extract_text(pdf_file: str) -> dict:
    """
    Extract and structure text content from an uploaded LinkedIn resume export PDF file
    for optimal LLM processing.

    Args:
        pdf_file: Path to the uploaded PDF file as a string

    Returns:
        dict: Resume text keyed by section name (see example below), or None if
        extraction or parsing fails

    Example:
        {
            "contact_info": "...",
            "summary": "...",
            "skills": "...",
            "experience": "...",
            "education": "...",
            "certifications": "...",
        }
    """
logger = logging.getLogger(f'{__name__}.extract_text')
try:
# Read the PDF file from the file path
with open(pdf_file, 'rb') as file:
file_content = file.read()
# Create PDF reader from the file content
pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
# Extract text from all pages
extracted_text = ""
num_pages = len(pdf_reader.pages)
logger.info("Extracting text from %d pages", num_pages)
for page_num in range(num_pages):
try:
page = pdf_reader.pages[page_num]
page_text = page.extract_text()
extracted_text += page_text + "\n\n"
except Exception as e:
logger.warning("Error extracting text from page %d: %s", page_num + 1, str(e))
continue
logger.info("Extracted text length: %d characters", len(extracted_text))
# Clean and structure the extracted text for LLM consumption
structured_content = _parse_resume_text(extracted_text)
if not structured_content:
return None
logger.info("Found sections: %s", list(structured_content.keys()))
# Save results to JSON file
try:
linkedin_profile_dir = Path(__file__).parent.parent / "data" / "linkedin_profile"
linkedin_profile_dir.mkdir(parents=True, exist_ok=True)
# Create timestamped filename
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_file = linkedin_profile_dir / f"linkedin_resume_{timestamp}.json"
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(structured_content, f, indent=2, ensure_ascii=False)
except Exception as save_error:
logger.warning("Failed to save LinkedIn resume extraction to file: %s", str(save_error))
return structured_content
except Exception as e:
logger.error("Error processing PDF file: %s", str(e))
return None


def _parse_resume_text(text: str) -> dict:
    """
    Parse resume text into logical sections for optimal LLM processing.

    Args:
        text (str): Raw extracted text from PDF

    Returns:
        dict: Section name mapped to cleaned section text, or None if the input text is empty
    """
if not text:
return None
# Define section patterns (common LinkedIn export sections)
section_patterns = {
"contact_info": r"(?i)(contact|personal|profile)\s*(?:information)?",
"summary": r"(?i)(summary|about|overview|profile)",
"skills": r"(?i)(skills|expertise|competencies|proficiencies)",
"experience": r"(?i)(experience|work|employment|professional)",
"education": r"(?i)(education|academic|university|college|school)",
"certifications": r"(?i)(certification|certificate|license)",
}
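    # Patterns are checked against each line in dict order and the first match wins,
    # so an ambiguous header such as "Profile" lands in the earlier contact_info bucket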
# Split text into lines for processing
lines = text.split('\n')
sections = {}
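    # Lines that appear before the first recognized section header are collected
    # under a catch-all "general" key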
current_section = "general"
current_content = []
for line in lines:
line = line.strip()
if not line:
continue
# Check if line is a section header
section_found = None
for section_name, pattern in section_patterns.items():
if re.match(pattern, line):
section_found = section_name
break
if section_found:
# Save previous section content
if current_content:
sections[current_section] = '\n'.join(current_content)
# Start new section
current_section = section_found
current_content = [line]
else:
current_content.append(line)
# Save the last section
if current_content:
sections[current_section] = '\n'.join(current_content)
# Clean each section
for section_name, content in sections.items():
sections[section_name] = _clean_section(content)
return sections


def _clean_section(text: str) -> str:
"""
Clean a section of text by normalizing whitespace and removing unnecessary characters.
Args:
text (str): The text section to clean
Returns:
str: Cleaned text section
"""
# Normalize unicode characters to avoid issues with special characters
text = unicodedata.normalize('NFKC', text)
    # Remove `Page n of n` footers added by the LinkedIn export
text = re.sub(r'Page \d+ of \d+', '', text)
# Clean redundant whitespace
text = clean_text_whitespace(text)
return text.strip()
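

# Minimal usage sketch for manual testing: runs the full extraction pipeline on a
# LinkedIn export and prints a preview of each section. The path below is a
# hypothetical placeholder; point it at a real "Profile.pdf" export before running.
if __name__ == "__main__":
    import pprint

    logging.basicConfig(level=logging.INFO)

    # Hypothetical path to a LinkedIn resume export PDF
    sections = extract_text("data/linkedin_profile/Profile.pdf")

    if sections is None:
        print("Extraction failed")
    else:
        # Show the first 80 characters of each extracted section
        pprint.pprint({name: content[:80] for name, content in sections.items()})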