""" context_acquisition.py Functions for acquiring context from various sources including PDF text extraction, GitHub profiles, and job posting text. """ import re import logging import io import json import unicodedata from pathlib import Path from datetime import datetime import PyPDF2 from functions.helper import clean_text_whitespace # pylint: disable=broad-exception-caught def extract_text(pdf_file: str) -> dict: """ Extract and structure text content from an uploaded LinkedIn resume export PDF file for optimal LLM processing. Args: pdf_file: The file path string to the uploaded PDF file Returns: dict: Dictionary containing extraction status, structured text content, and metadata Example: { "contact_info": "...", "summary": "...", "skills": "...", "experience": "...", "education": "...", "certifications": "...", } """ logger = logging.getLogger(f'{__name__}.extract_text') try: # Read the PDF file from the file path with open(pdf_file, 'rb') as file: file_content = file.read() # Create PDF reader from the file content pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content)) # Extract text from all pages extracted_text = "" num_pages = len(pdf_reader.pages) logger.info("Extracting text from %d pages", num_pages) for page_num in range(num_pages): try: page = pdf_reader.pages[page_num] page_text = page.extract_text() extracted_text += page_text + "\n\n" except Exception as e: logger.warning("Error extracting text from page %d: %s", page_num + 1, str(e)) continue logger.info("Extracted text length: %d characters", len(extracted_text)) # Clean and structure the extracted text for LLM consumption structured_content = _parse_resume_text(extracted_text) if not structured_content: return None logger.info("Found sections: %s", list(structured_content.keys())) # Save results to JSON file try: linkedin_profile_dir = Path(__file__).parent.parent / "data" / "linkedin_profile" linkedin_profile_dir.mkdir(parents=True, exist_ok=True) # Create timestamped filename timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") output_file = linkedin_profile_dir / f"linkedin_resume_{timestamp}.json" with open(output_file, 'w', encoding='utf-8') as f: json.dump(structured_content, f, indent=2, ensure_ascii=False) except Exception as save_error: logger.warning("Failed to save LinkedIn resume extraction to file: %s", str(save_error)) return structured_content except Exception as e: logger.error("Error processing PDF file: %s", str(e)) return None def _parse_resume_text(text: str) -> dict: """ Parse resume text into logical sections for optimal LLM processing. 
def _parse_resume_text(text: str) -> Optional[dict]:
    """
    Parse resume text into logical sections for optimal LLM processing.

    Args:
        text (str): Raw extracted text from PDF

    Returns:
        dict: Resume text split into named, cleaned sections,
              or None if the input text is empty
    """
    if not text:
        return None

    # Define section patterns (common LinkedIn export sections)
    section_patterns = {
        "contact_info": r"(?i)(contact|personal|profile)\s*(?:information)?",
        "summary": r"(?i)(summary|about|overview|profile)",
        "skills": r"(?i)(skills|expertise|competencies|proficiencies)",
        "experience": r"(?i)(experience|work|employment|professional)",
        "education": r"(?i)(education|academic|university|college|school)",
        "certifications": r"(?i)(certification|certificate|license)",
    }

    # Split text into lines for processing
    lines = text.split('\n')
    sections = {}
    current_section = "general"
    current_content = []

    for line in lines:
        line = line.strip()
        if not line:
            continue

        # Check if line is a section header
        section_found = None
        for section_name, pattern in section_patterns.items():
            if re.match(pattern, line):
                section_found = section_name
                break

        if section_found:
            # Save previous section content
            if current_content:
                sections[current_section] = '\n'.join(current_content)
            # Start new section
            current_section = section_found
            current_content = [line]
        else:
            current_content.append(line)

    # Save the last section
    if current_content:
        sections[current_section] = '\n'.join(current_content)

    # Clean each section
    for section_name, content in sections.items():
        sections[section_name] = _clean_section(content)

    return sections


def _clean_section(text: str) -> str:
    """
    Clean a section of text by normalizing whitespace and removing
    unnecessary characters.

    Args:
        text (str): The text section to clean

    Returns:
        str: Cleaned text section
    """
    # Normalize unicode characters to avoid issues with special characters
    text = unicodedata.normalize('NFKC', text)

    # Remove `Page n of n` footers added by the LinkedIn export
    text = re.sub(r'Page \d+ of \d+', '', text)

    # Clean redundant whitespace
    text = clean_text_whitespace(text)

    return text.strip()
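

# Minimal manual check (a sketch only, not part of the module's API): runs
# extract_text against a locally saved LinkedIn PDF export and prints the
# section names it found. The resume path below is a hypothetical example.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    resume_sections = extract_text("data/uploads/linkedin_resume.pdf")
    if resume_sections:
        print("Sections found:", list(resume_sections.keys()))
    else:
        print("Extraction failed or returned no sections")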