CVBuilder-ATS-Friendly / utils /parsing_utils.py
mkhekare's picture
Update utils/parsing_utils.py
ece7937 verified
import re
from io import BytesIO
from pdfminer.high_level import extract_text as extract_text_from_pdf
from docx import Document
import magic
def extract_text_from_file(uploaded_file):
    """Extract plain text from an uploaded PDF, DOCX, or TXT file.

    The file type is detected from the content (via libmagic), not from the
    file name.

    Args:
        uploaded_file: A binary file-like object exposing ``read()``.

    Returns:
        The extracted text as a ``str``.

    Raises:
        ValueError: If the detected MIME type is not PDF, DOCX, or plain text.
    """
    file_content = uploaded_file.read()
    mime = magic.Magic(mime=True)
    file_type = mime.from_buffer(file_content)

    if file_type == 'application/pdf':
        with BytesIO(file_content) as pdf_file:
            return extract_text_from_pdf(pdf_file)

    if file_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
        with BytesIO(file_content) as docx_file:
            doc = Document(docx_file)
            # Only body paragraphs are read here.
            return "\n".join(para.text for para in doc.paragraphs)

    if file_type == 'text/plain':
        # 'utf-8-sig' drops a leading BOM so it does not end up glued to the
        # first line; errors='replace' keeps a CV saved in a legacy encoding
        # from crashing the upload with UnicodeDecodeError.
        return file_content.decode('utf-8-sig', errors='replace')

    raise ValueError(f"Unsupported file type: {file_type}")
# Contact-detail patterns. The TLD class is [A-Za-z]; the previous
# "[A-Z|a-z]" also accepted a literal "|" and produced e.g. "a@b.com|".
_CV_EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
_CV_PHONE_RE = re.compile(r'(\+?\d[\d\s\-\(\)]{7,}\d)')
_CV_LINKEDIN_RE = re.compile(r'(https?://)?(www\.)?linkedin\.com/[a-zA-Z0-9\-\./]+')

# Section keywords, tried in this order (first match wins). Whole-word
# matching keeps "experienced" from being read as an "Experience" header.
_CV_SECTION_HEADERS = (
    ('summary', re.compile(r'\b(?:summary|objective)\b', re.IGNORECASE)),
    ('experience', re.compile(r'\b(?:experience|work history)\b', re.IGNORECASE)),
    ('education', re.compile(r'\beducation\b', re.IGNORECASE)),
    ('skills', re.compile(r'\bskills\b', re.IGNORECASE)),
    ('projects', re.compile(r'\bprojects\b', re.IGNORECASE)),
)

# A header is a short line; longer lines are sentences that merely mention
# a keyword.
_CV_MAX_HEADER_WORDS = 5

# Leading bullet markers: U+2022 bullet, its mis-decoded form (the three
# characters U+03B2 U+20AC U+2019), and common ASCII/Unicode bullets.
_CV_BULLET_RE = re.compile(
    r'^(?:(?:\u2022|\u03b2\u20ac\u2019|[-*\u2013\u25aa\u25e6\u25cf\u00b7])\s*)+'
)


def _cv_split_header(line):
    """Return ``(section, inline_content)`` if *line* is a section header, else None."""
    if _CV_BULLET_RE.match(line):
        return None
    # "Skills: Python, SQL" -> header part "Skills", inline content after ":".
    head, _, tail = line.partition(':')
    if len(head.split()) > _CV_MAX_HEADER_WORDS:
        return None
    for section, keyword_re in _CV_SECTION_HEADERS:
        if keyword_re.search(head):
            return section, tail.strip()
    return None


def _cv_add_line(data, section, line):
    """Store one content *line* under *section* of the result dict *data*."""
    if section == 'summary':
        data['summary'] += ' ' + line
        return
    if section == 'skills':
        unbulleted = _CV_BULLET_RE.sub('', line)
        data['skills'].extend(s.strip() for s in unbulleted.split(',') if s.strip())
        return
    entries = data[section]
    if not _CV_BULLET_RE.match(line):
        # A non-bullet line starts a new entry.
        entries.append({'title': line, 'description': []})
    elif entries:
        # A bullet line continues the most recent entry. Bullets that appear
        # before any entry have nothing to attach to and are skipped.
        detail = _CV_BULLET_RE.sub('', line).strip()
        if detail:
            entries[-1]['description'].append(detail)


def parse_cv_content(text):
    """Parse raw CV text into a dictionary of structured fields.

    Heuristics used:
      * name: the first non-empty line, title-cased;
      * email / phone / linkedin: first regex match anywhere in the text;
      * sections: a short line (or the part before a colon) containing one of
        the section keywords starts that section; text after the colon is
        kept as the section's first content line.

    Args:
        text: The CV as plain text. ``None`` is treated as empty.

    Returns:
        A dict with the keys ``name``, ``email``, ``phone``, ``linkedin``,
        ``summary`` (strings), ``skills`` (list of strings) and
        ``experience``, ``education``, ``projects`` (lists of
        ``{'title': str, 'description': [str, ...]}`` dicts).
    """
    text = text or ''
    data = {
        'name': '',
        'email': '',
        'phone': '',
        'linkedin': '',
        'summary': '',
        'skills': [],
        'experience': [],
        'education': [],
        'projects': [],
    }

    lines = [line.strip() for line in text.split('\n') if line.strip()]
    if lines:
        data['name'] = lines[0].title()

    email = _CV_EMAIL_RE.search(text)
    if email:
        data['email'] = email.group(0)

    phone = _CV_PHONE_RE.search(text)
    if phone:
        data['phone'] = phone.group(1).strip()

    linkedin = _CV_LINKEDIN_RE.search(text)
    if linkedin:
        data['linkedin'] = linkedin.group(0)

    current_section = None
    for line in lines:
        header = _cv_split_header(line)
        if header is not None:
            current_section, inline_content = header
            if inline_content:
                _cv_add_line(data, current_section, inline_content)
        elif current_section:
            _cv_add_line(data, current_section, line)

    # Collapse runs of whitespace left by joining summary lines.
    data['summary'] = ' '.join(data['summary'].split())
    return data
def extract_section(text, start_pattern, end_pattern):
    """Return the text found between two regex markers.

    Both patterns are matched case-insensitively. The section begins right
    after the first match of ``start_pattern`` and ends just before the first
    subsequent match of ``end_pattern``; when no end marker is found the
    section runs to the end of ``text``.

    Returns:
        The whitespace-trimmed section, or ``None`` if ``start_pattern`` does
        not occur in ``text``.
    """
    opening = re.search(start_pattern, text, flags=re.IGNORECASE)
    if opening is None:
        return None

    tail = text[opening.end():]
    closing = re.search(end_pattern, tail, flags=re.IGNORECASE)
    body = tail if closing is None else tail[:closing.start()]
    return body.strip()
# A four-digit year that is not part of a longer digit run (so phone numbers
# and IDs are not mistaken for years).
_DATE_YEAR = r'(?<!\d)\d{4}(?!\d)'
# A word followed by a year, e.g. "Jan 2020", "Jan. 2020", "Q1 2020".
_DATE_WORD_YEAR = r'\w+\.?\s?' + _DATE_YEAR
# Hyphen, en dash or em dash with optional single surrounding whitespace.
_DATE_SEP = r'\s?[-\u2013\u2014]\s?'
_DATE_OPEN_END = r'(?:Present|Current)\b'

# Ordered from most to least specific; the first pattern that matches wins.
_DATE_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        # Jan 2020 - Dec 2022 / Jan 2020 - Present
        '(' + _DATE_WORD_YEAR + ')' + _DATE_SEP
        + '(' + _DATE_WORD_YEAR + '|' + _DATE_OPEN_END + ')',
        # 2020 - 2022 / 2020 - present
        '(' + _DATE_YEAR + ')' + _DATE_SEP
        + '(' + _DATE_YEAR + '|' + _DATE_OPEN_END + ')',
        # Jan 2020
        '(' + _DATE_WORD_YEAR + ')',
        # 2020
        '(' + _DATE_YEAR + ')',
    )
)


def extract_dates(text):
    """Extract the first date or date range found in *text*.

    Patterns are tried from most to least specific: a "word year" range,
    a year range, a single "word year", then a bare year. "Present" and
    "Current" (any letter case) are accepted as the end of a range. The word
    before a year is not validated as a month name.

    Args:
        text: Text to search. ``None`` or an empty string yields ``[]``.

    Returns:
        ``[start, end]`` for a range, ``[date]`` for a single date, or ``[]``
        when nothing matches. Values are the matched substrings, stripped.
    """
    if not text:
        return []
    for pattern in _DATE_PATTERNS:
        match = pattern.search(text)
        if match:
            return [group.strip() for group in match.groups()]
    return []