Spaces:

mkhekare
/

CVBuilder-ATS-Friendly

Sleeping

App Files Files Community

CVBuilder-ATS-Friendly / utils /parsing_utils.py

mkhekare

Update utils/parsing_utils.py

ece7937 verified 4 months ago

raw

history blame contribute delete

4.74 kB

	import re
	from io import BytesIO
	from pdfminer.high_level import extract_text as extract_text_from_pdf
	from docx import Document
	import magic

	def extract_text_from_file(uploaded_file):
	    """Return the plain text of an uploaded PDF, DOCX, or TXT file.

	    The whole upload is read into memory and its MIME type is detected from
	    the bytes themselves (via ``magic``), not from a file name.

	    Args:
	        uploaded_file: Object exposing ``read()`` that returns the file bytes.

	    Returns:
	        The extracted text. DOCX paragraphs are joined with newlines.

	    Raises:
	        ValueError: If the detected MIME type is not PDF, DOCX, or plain text.
	        UnicodeDecodeError: If a plain-text upload is not valid UTF-8.
	    """
	    payload = uploaded_file.read()
	    detected_type = magic.Magic(mime=True).from_buffer(payload)

	    if detected_type == 'application/pdf':
	        with BytesIO(payload) as stream:
	            return extract_text_from_pdf(stream)

	    if detected_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
	        with BytesIO(payload) as stream:
	            paragraphs = Document(stream).paragraphs
	            return "\n".join([paragraph.text for paragraph in paragraphs])

	    if detected_type == 'text/plain':
	        return payload.decode('utf-8')

	    raise ValueError(f"Unsupported file type: {detected_type}")

	# Section keys paired with a pattern for the heading words that introduce
	# them, in priority order: the first section whose pattern matches wins.
	_SECTION_PATTERNS = (
	    ('summary', re.compile(r'\b(?:summary|objective)\b', re.IGNORECASE)),
	    ('experience', re.compile(r'\b(?:experience|work history)\b', re.IGNORECASE)),
	    ('education', re.compile(r'\beducation\b', re.IGNORECASE)),
	    ('skills', re.compile(r'\bskills\b', re.IGNORECASE)),
	    ('projects', re.compile(r'\bprojects\b', re.IGNORECASE)),
	)
	_BULLET_PREFIXES = ('•', '-', '*')
	_BULLET_CHARS = '•-* '
	# Headings are short labels; longer lines that merely mention a keyword
	# (e.g. "8 years of experience in ...") are treated as content.
	_MAX_HEADING_WORDS = 4


	def _match_section_heading(line):
	    """Return ``(section, inline_text)`` if *line* is a section heading, else ``None``.

	    The label is the text before the first colon (or the whole line when
	    there is no colon). It counts as a heading when it is not a bullet, has
	    at most ``_MAX_HEADING_WORDS`` words and contains a section keyword as a
	    whole word. ``inline_text`` is whatever follows the colon, stripped.
	    """
	    if line.startswith(_BULLET_PREFIXES):
	        return None
	    label, _, inline_text = line.partition(':')
	    if len(label.split()) > _MAX_HEADING_WORDS:
	        return None
	    for section, pattern in _SECTION_PATTERNS:
	        if pattern.search(label):
	            return section, inline_text.strip()
	    return None


	def _append_section_line(data, section, line):
	    """Store one content *line* under *section* of the result dict *data*."""
	    if section == 'summary':
	        data['summary'] += ' ' + line
	    elif section == 'skills':
	        # Drop a leading bullet marker so it does not end up inside a skill.
	        items = line.lstrip(_BULLET_CHARS).split(',')
	        data['skills'].extend(item.strip() for item in items if item.strip())
	    else:
	        entries = data[section]
	        if not line.startswith(_BULLET_PREFIXES):
	            # A non-bulleted line starts a new entry.
	            entries.append({'title': line, 'description': []})
	        elif entries:
	            # A bulleted line describes the most recent entry; bullets that
	            # appear before any entry have nothing to attach to.
	            entries[-1]['description'].append(line.strip(_BULLET_CHARS))


	def parse_cv_content(text):
	    """Parse raw CV text into a dict of contact details and sections.

	    Contact fields are taken from the first regex match anywhere in the
	    text; the name is the first non-empty line, title-cased. Sections are
	    found by scanning the non-empty lines for short headings (optionally
	    followed by ``:`` and inline content); every other line is stored under
	    the most recent heading.

	    Args:
	        text: Full CV text. ``None`` or an empty string yields empty fields.

	    Returns:
	        A dict with string values for ``name``, ``email``, ``phone``,
	        ``linkedin`` and ``summary``; a list of strings for ``skills``; and
	        lists of ``{'title': str, 'description': list[str]}`` dicts for
	        ``experience``, ``education`` and ``projects``.
	    """
	    data = {
	        'name': '',
	        'email': '',
	        'phone': '',
	        'linkedin': '',
	        'summary': '',
	        'skills': [],
	        'experience': [],
	        'education': [],
	        'projects': []
	    }
	    text = text or ''

	    # Name extraction (first non-empty line, title-cased)
	    lines = [line.strip() for line in text.split('\n') if line.strip()]
	    if lines:
	        data['name'] = lines[0].title()

	    # Email extraction. The TLD class is letters only, so a "|" used as a
	    # separator right after the address is not swallowed into the match.
	    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
	    email = re.search(email_pattern, text)
	    if email:
	        data['email'] = email.group(0)

	    # Phone extraction (international format)
	    phone_pattern = r'(\+?\d[\d\s\-\(\)]{7,}\d)'
	    phone = re.search(phone_pattern, text)
	    if phone:
	        data['phone'] = phone.group(1).strip()

	    # LinkedIn URL extraction
	    linkedin_pattern = r'(https?://)?(www\.)?linkedin\.com/[a-zA-Z0-9\-\./]+'
	    linkedin = re.search(linkedin_pattern, text)
	    if linkedin:
	        data['linkedin'] = linkedin.group(0)

	    # Section parsing: a heading switches the current section, and any text
	    # after its colon is treated as the first content line of that section.
	    current_section = None
	    for line in lines:
	        heading = _match_section_heading(line)
	        if heading is not None:
	            current_section, inline_text = heading
	            if inline_text:
	                _append_section_line(data, current_section, inline_text)
	        elif current_section:
	            _append_section_line(data, current_section, line)

	    # Collapse runs of whitespace in the summary
	    data['summary'] = ' '.join(data['summary'].split())

	    return data

	def extract_section(text, start_pattern, end_pattern):
	    """Return the text between the first start match and the next end match.

	    Both patterns are searched case-insensitively. The end pattern is only
	    looked for after the end of the start match; if it is not found, the
	    section runs to the end of *text*.

	    Args:
	        text: Text to search.
	        start_pattern: Regex marking where the section begins (excluded).
	        end_pattern: Regex marking where the section stops (excluded).

	    Returns:
	        The stripped section text, or ``None`` when *start_pattern* does
	        not match.
	    """
	    opening = re.search(start_pattern, text, re.IGNORECASE)
	    if opening is None:
	        return None

	    tail = text[opening.end():]
	    closing = re.search(end_pattern, tail, re.IGNORECASE)
	    cutoff = closing.start() if closing is not None else len(tail)
	    return tail[:cutoff].strip()

	# Date patterns, most specific first. The "|" inside the second group is a
	# real alternation, so a range may end in "Present" or "Current".
	_DATE_PATTERNS = (
	    r'(\w+\s?\d{4})\s?[-–—]\s?(\w+\s?\d{4}|Present|Current)',  # Jan 2020 - Dec 2022
	    r'(\d{4})\s?[-–—]\s?(\d{4}|Present|Current)',  # 2020 - 2022
	    r'(\w+\s?\d{4})',  # Jan 2020
	    r'(\d{4})',  # 2020
	)


	def extract_dates(text):
	    """Extract the first date or date range found in *text*.

	    Patterns are tried from most to least specific and only the first
	    match of the first pattern that matches is used.

	    Args:
	        text: Text to search. ``None`` or an empty string yields ``[]``.

	    Returns:
	        ``[start, end]`` for a range (``end`` may be ``Present`` or
	        ``Current``), ``[date]`` for a single date, or ``[]`` when nothing
	        matches.
	    """
	    if not text:
	        return []

	    for pattern in _DATE_PATTERNS:
	        found = re.search(pattern, text)
	        if found:
	            # Every group in these patterns is mandatory, so none is None.
	            return [part.strip() for part in found.groups()]

	    return []