Spaces:
Sleeping
Sleeping
Update utils/parsing_utils.py
Browse files- utils/parsing_utils.py +37 -81
utils/parsing_utils.py
CHANGED
|
@@ -25,8 +25,6 @@ def extract_text_from_file(uploaded_file):
|
|
| 25 |
return text
|
| 26 |
|
| 27 |
def parse_cv_content(text):
|
| 28 |
-
"""Parse extracted text into structured CV data"""
|
| 29 |
-
# Basic parsing - this can be enhanced with more sophisticated NLP
|
| 30 |
data = {
|
| 31 |
'name': '',
|
| 32 |
'email': '',
|
|
@@ -39,102 +37,60 @@ def parse_cv_content(text):
|
|
| 39 |
'projects': []
|
| 40 |
}
|
| 41 |
|
| 42 |
-
#
|
| 43 |
-
lines = text.split('\n')
|
| 44 |
if lines:
|
| 45 |
-
data['name'] = lines[0].
|
| 46 |
|
| 47 |
-
#
|
| 48 |
-
email_pattern = r'[
|
| 49 |
emails = re.findall(email_pattern, text)
|
| 50 |
if emails:
|
| 51 |
data['email'] = emails[0]
|
| 52 |
|
| 53 |
-
#
|
| 54 |
phone_pattern = r'(\+?\d[\d\s\-\(\)]{7,}\d)'
|
| 55 |
phones = re.findall(phone_pattern, text)
|
| 56 |
if phones:
|
| 57 |
data['phone'] = phones[0].strip()
|
| 58 |
|
| 59 |
-
#
|
| 60 |
linkedin_pattern = r'(https?://)?(www\.)?linkedin\.com/[a-zA-Z0-9\-\./]+'
|
| 61 |
linkedin = re.search(linkedin_pattern, text)
|
| 62 |
if linkedin:
|
| 63 |
data['linkedin'] = linkedin.group(0)
|
| 64 |
|
| 65 |
-
#
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
|
| 70 |
-
#
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
# Simple parsing - can be enhanced
|
| 74 |
-
positions = re.split(r'\n(?=\w.*\d{4})', exp_section) # Split by new line followed by text and year
|
| 75 |
-
for pos in positions:
|
| 76 |
-
if pos.strip():
|
| 77 |
-
# Simple extraction - can be made more robust
|
| 78 |
-
lines = [line.strip() for line in pos.split('\n') if line.strip()]
|
| 79 |
-
if lines:
|
| 80 |
-
# First line typically contains title, company, dates
|
| 81 |
-
title_company = lines[0]
|
| 82 |
-
dates = extract_dates(title_company)
|
| 83 |
-
|
| 84 |
-
# Try to split title and company
|
| 85 |
-
parts = re.split(r' at |, | - | \| ', title_company)
|
| 86 |
-
title = parts[0] if parts else ""
|
| 87 |
-
company = parts[1] if len(parts) > 1 else ""
|
| 88 |
-
|
| 89 |
-
# Description is the rest
|
| 90 |
-
description = "\n".join(lines[1:]) if len(lines) > 1 else ""
|
| 91 |
-
|
| 92 |
-
data['experience'].append({
|
| 93 |
-
'title': title,
|
| 94 |
-
'company': company,
|
| 95 |
-
'start_date': dates[0] if dates else "",
|
| 96 |
-
'end_date': dates[1] if len(dates) > 1 else "",
|
| 97 |
-
'description': description
|
| 98 |
-
})
|
| 99 |
-
|
| 100 |
-
# Extract education
|
| 101 |
-
edu_section = extract_section(text, "Education", "Experience|Projects|Skills")
|
| 102 |
-
if edu_section:
|
| 103 |
-
institutions = re.split(r'\n(?=\w)', edu_section) # Split by new line followed by word
|
| 104 |
-
for inst in institutions:
|
| 105 |
-
if inst.strip():
|
| 106 |
-
lines = [line.strip() for line in inst.split('\n') if line.strip()]
|
| 107 |
-
if lines:
|
| 108 |
-
# First line typically contains degree and institution
|
| 109 |
-
degree_institution = lines[0]
|
| 110 |
-
dates = extract_dates(degree_institution)
|
| 111 |
-
|
| 112 |
-
# Try to split degree and institution
|
| 113 |
-
parts = re.split(r', | at | - | \| ', degree_institution)
|
| 114 |
-
degree = parts[0] if parts else ""
|
| 115 |
-
institution = parts[1] if len(parts) > 1 else ""
|
| 116 |
-
|
| 117 |
-
data['education'].append({
|
| 118 |
-
'degree': degree,
|
| 119 |
-
'institution': institution,
|
| 120 |
-
'start_date': dates[0] if dates else "",
|
| 121 |
-
'end_date': dates[1] if len(dates) > 1 else ""
|
| 122 |
-
})
|
| 123 |
-
|
| 124 |
-
# Extract projects
|
| 125 |
-
proj_section = extract_section(text, "Projects", "Experience|Education|Skills")
|
| 126 |
-
if proj_section:
|
| 127 |
-
projects = re.split(r'\n(?=\w)', proj_section) # Split by new line followed by word
|
| 128 |
-
for proj in projects:
|
| 129 |
-
if proj.strip():
|
| 130 |
-
lines = [line.strip() for line in proj.split('\n') if line.strip()]
|
| 131 |
-
if lines:
|
| 132 |
-
title = lines[0]
|
| 133 |
-
description = "\n".join(lines[1:]) if len(lines) > 1 else ""
|
| 134 |
-
data['projects'].append({
|
| 135 |
-
'title': title,
|
| 136 |
-
'description': description
|
| 137 |
-
})
|
| 138 |
|
| 139 |
return data
|
| 140 |
|
|
|
|
| 25 |
return text
|
| 26 |
|
| 27 |
def parse_cv_content(text):
    """Parse raw CV text into a structured dict.

    Contact details (email, phone, LinkedIn URL) are pulled out with
    regexes over the whole text; the remaining structure is built by a
    line-by-line scan that switches sections when a line contains a
    section keyword (summary/objective, experience/work history,
    education, skills, projects) and folds subsequent lines into the
    current section.

    Args:
        text: Plain text extracted from an uploaded CV file.

    Returns:
        dict with keys: 'name', 'email', 'phone', 'linkedin',
        'summary' (str); 'skills' (list[str]); 'experience',
        'education', 'projects' (list[dict] with 'title' (str) and
        'description' (list[str]) keys).
    """
    # NOTE(review): the dict entries between 'email' and 'projects' were
    # elided in the diff view this was reconstructed from; they are
    # inferred from how `data` is used below — confirm against the file.
    data = {
        'name': '',
        'email': '',
        'phone': '',
        'linkedin': '',
        'summary': '',
        'skills': [],
        'experience': [],
        'education': [],
        'projects': []
    }

    # Name extraction: first non-empty line, normalized to title case.
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    if lines:
        data['name'] = lines[0].title()

    # Email extraction.
    # FIX: the original class was [A-Z|a-z] — inside a character class
    # '|' is a literal, so it wrongly accepted '|' in the TLD; the
    # intended class is [A-Za-z].
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    emails = re.findall(email_pattern, text)
    if emails:
        data['email'] = emails[0]

    # Phone extraction (international format: optional '+', then at
    # least 9 characters of digits/spaces/dashes/parens).
    phone_pattern = r'(\+?\d[\d\s\-\(\)]{7,}\d)'
    phones = re.findall(phone_pattern, text)
    if phones:
        data['phone'] = phones[0].strip()

    # LinkedIn URL extraction (scheme and 'www.' optional).
    linkedin_pattern = r'(https?://)?(www\.)?linkedin\.com/[a-zA-Z0-9\-\./]+'
    linkedin = re.search(linkedin_pattern, text)
    if linkedin:
        data['linkedin'] = linkedin.group(0)

    # Simple keyword-driven section parsing: a line containing a section
    # keyword switches the current section; later lines are folded into
    # whichever section is active.
    current_section = None
    for line in lines:
        line_lower = line.lower()
        if 'summary' in line_lower or 'objective' in line_lower:
            current_section = 'summary'
            # Keep any text that shares the header line with the keyword.
            data[current_section] = line.replace('Summary', '').replace('SUMMARY', '').strip()
        elif 'experience' in line_lower or 'work history' in line_lower:
            current_section = 'experience'
        elif 'education' in line_lower:
            current_section = 'education'
        elif 'skills' in line_lower:
            current_section = 'skills'
        elif 'projects' in line_lower:
            current_section = 'projects'
        elif current_section:
            if current_section == 'summary':
                data[current_section] += ' ' + line
            elif current_section == 'skills' and line.strip():
                # Skills are comma-separated on each line.
                data['skills'].extend([s.strip() for s in line.split(',') if s.strip()])
            elif current_section in ['experience', 'education', 'projects'] and line.strip():
                if line.strip() and not line.strip().startswith(('•', '-', '*')):
                    # A non-bullet line starts a new entry.
                    data[current_section].append({'title': line, 'description': []})
                elif data[current_section] and line.strip():
                    # Bullet lines continue the previous entry.
                    data[current_section][-1]['description'].append(line.strip('•-* '))

    # Collapse runs of whitespace accumulated while joining summary lines.
    if 'summary' in data:
        data['summary'] = ' '.join(data['summary'].split())

    return data