""" | |
context_acquisition.py | |
Functions for acquiring context from various sources including PDF text extraction, | |
GitHub profiles, and job posting text. | |
""" | |
import re | |
import logging | |
import io | |
import json | |
import unicodedata | |
from pathlib import Path | |
from datetime import datetime | |
import PyPDF2 | |
from functions.helper import clean_text_whitespace | |
# pylint: disable=broad-exception-caught | |


def extract_text(pdf_file: str) -> Optional[dict]:
    """
    Extract and structure text content from an uploaded LinkedIn resume export PDF file
    for optimal LLM processing.

    Args:
        pdf_file: The file path string to the uploaded PDF file

    Returns:
        dict: Resume sections keyed by section name, or None if extraction fails.

    Example:
        {
            "contact_info": "...",
            "summary": "...",
            "skills": "...",
            "experience": "...",
            "education": "...",
            "certifications": "...",
        }
    """
    logger = logging.getLogger(f'{__name__}.extract_text')
    try:
        # Read the PDF file from the file path
        with open(pdf_file, 'rb') as file:
            file_content = file.read()

        # Create PDF reader from the file content
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))

        # Extract text from all pages
        extracted_text = ""
        num_pages = len(pdf_reader.pages)
        logger.info("Extracting text from %d pages", num_pages)
        for page_num in range(num_pages):
            try:
                page = pdf_reader.pages[page_num]
                page_text = page.extract_text()
                extracted_text += page_text + "\n\n"
            except Exception as e:
                logger.warning("Error extracting text from page %d: %s", page_num + 1, str(e))
                continue
        logger.info("Extracted text length: %d characters", len(extracted_text))

        # Clean and structure the extracted text for LLM consumption
        structured_content = _parse_resume_text(extracted_text)
        if not structured_content:
            return None
        logger.info("Found sections: %s", list(structured_content.keys()))

        # Save results to JSON file
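        # (Best-effort: a save failure is logged as a warning but does not
        # abort the extraction.)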
        try:
            linkedin_profile_dir = Path(__file__).parent.parent / "data" / "linkedin_profile"
            linkedin_profile_dir.mkdir(parents=True, exist_ok=True)

            # Create timestamped filename
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_file = linkedin_profile_dir / f"linkedin_resume_{timestamp}.json"
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(structured_content, f, indent=2, ensure_ascii=False)
        except Exception as save_error:
            logger.warning("Failed to save LinkedIn resume extraction to file: %s", str(save_error))

        return structured_content
    except Exception as e:
        logger.error("Error processing PDF file: %s", str(e))
        return None


def _parse_resume_text(text: str) -> Optional[dict]:
    """
    Parse resume text into logical sections for optimal LLM processing.

    Args:
        text (str): Raw extracted text from the PDF

    Returns:
        dict: Section names mapped to their cleaned text content, or None if the
            input text is empty.
    """
    if not text:
        return None

    # Define section patterns (common LinkedIn export sections)
    section_patterns = {
        "contact_info": r"(?i)(contact|personal|profile)\s*(?:information)?",
        "summary": r"(?i)(summary|about|overview|profile)",
        "skills": r"(?i)(skills|expertise|competencies|proficiencies)",
        "experience": r"(?i)(experience|work|employment|professional)",
        "education": r"(?i)(education|academic|university|college|school)",
        "certifications": r"(?i)(certification|certificate|license)",
    }
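    # Note: dict insertion order doubles as match precedence, so a header such
    # as "Profile" is claimed by the contact_info pattern before the summary
    # pattern (which also lists "profile") is tried.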

    # Split text into lines for processing
    lines = text.split('\n')
    sections = {}
    current_section = "general"
    current_content = []
    for line in lines:
        line = line.strip()
        if not line:
            continue

        # Check if line is a section header
        section_found = None
        for section_name, pattern in section_patterns.items():
            if re.match(pattern, line):
                section_found = section_name
                break

        if section_found:
            # Save previous section content
            if current_content:
                sections[current_section] = '\n'.join(current_content)
            # Start new section
            current_section = section_found
            current_content = [line]
        else:
            current_content.append(line)

    # Save the last section
    if current_content:
        sections[current_section] = '\n'.join(current_content)

    # Clean each section
    for section_name, content in sections.items():
        sections[section_name] = _clean_section(content)

    return sections


def _clean_section(text: str) -> str:
    """
    Clean a section of text by normalizing whitespace and removing unnecessary characters.

    Args:
        text (str): The text section to clean

    Returns:
        str: Cleaned text section
    """
    # Normalize unicode characters to avoid issues with special characters
    text = unicodedata.normalize('NFKC', text)
    # Remove the "Page n of n" footer added by the LinkedIn export
    text = re.sub(r'Page \d+ of \d+', '', text)
    # Clean redundant whitespace
    text = clean_text_whitespace(text)
    return text.strip()
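

# --- Usage sketch ---
# A minimal example of running the extractor from the command line, assuming a
# LinkedIn resume export exists at the (hypothetical) path below.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    result = extract_text("data/linkedin_resume.pdf")  # hypothetical sample path
    if result:
        print("Extracted sections:", ", ".join(result.keys()))
    else:
        print("Extraction failed; see log output for details.")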