mkhekare committed on
Commit
ece7937
·
verified ·
1 Parent(s): b5949d6

Update utils/parsing_utils.py

Browse files
Files changed (1) hide show
  1. utils/parsing_utils.py +37 -81
utils/parsing_utils.py CHANGED
@@ -25,8 +25,6 @@ def extract_text_from_file(uploaded_file):
25
  return text
26
 
27
  def parse_cv_content(text):
28
- """Parse extracted text into structured CV data"""
29
- # Basic parsing - this can be enhanced with more sophisticated NLP
30
  data = {
31
  'name': '',
32
  'email': '',
@@ -39,102 +37,60 @@ def parse_cv_content(text):
39
  'projects': []
40
  }
41
 
42
- # Extract name (first line is often the name)
43
- lines = text.split('\n')
44
  if lines:
45
- data['name'] = lines[0].strip()
46
 
47
- # Extract email
48
- email_pattern = r'[\w\.-]+@[\w\.-]+'
49
  emails = re.findall(email_pattern, text)
50
  if emails:
51
  data['email'] = emails[0]
52
 
53
- # Extract phone (simple international pattern)
54
  phone_pattern = r'(\+?\d[\d\s\-\(\)]{7,}\d)'
55
  phones = re.findall(phone_pattern, text)
56
  if phones:
57
  data['phone'] = phones[0].strip()
58
 
59
- # Extract LinkedIn URL
60
  linkedin_pattern = r'(https?://)?(www\.)?linkedin\.com/[a-zA-Z0-9\-\./]+'
61
  linkedin = re.search(linkedin_pattern, text)
62
  if linkedin:
63
  data['linkedin'] = linkedin.group(0)
64
 
65
- # Extract skills (looking for "Skills" section)
66
- skills_section = extract_section(text, "Skills", "Experience|Education|Projects")
67
- if skills_section:
68
- data['skills'] = [s.strip() for s in re.split(r'[,;•\-]', skills_section) if s.strip()]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
- # Extract experience (simplified)
71
- exp_section = extract_section(text, "Experience|Work History", "Education|Projects|Skills")
72
- if exp_section:
73
- # Simple parsing - can be enhanced
74
- positions = re.split(r'\n(?=\w.*\d{4})', exp_section) # Split by new line followed by text and year
75
- for pos in positions:
76
- if pos.strip():
77
- # Simple extraction - can be made more robust
78
- lines = [line.strip() for line in pos.split('\n') if line.strip()]
79
- if lines:
80
- # First line typically contains title, company, dates
81
- title_company = lines[0]
82
- dates = extract_dates(title_company)
83
-
84
- # Try to split title and company
85
- parts = re.split(r' at |, | - | \| ', title_company)
86
- title = parts[0] if parts else ""
87
- company = parts[1] if len(parts) > 1 else ""
88
-
89
- # Description is the rest
90
- description = "\n".join(lines[1:]) if len(lines) > 1 else ""
91
-
92
- data['experience'].append({
93
- 'title': title,
94
- 'company': company,
95
- 'start_date': dates[0] if dates else "",
96
- 'end_date': dates[1] if len(dates) > 1 else "",
97
- 'description': description
98
- })
99
-
100
- # Extract education
101
- edu_section = extract_section(text, "Education", "Experience|Projects|Skills")
102
- if edu_section:
103
- institutions = re.split(r'\n(?=\w)', edu_section) # Split by new line followed by word
104
- for inst in institutions:
105
- if inst.strip():
106
- lines = [line.strip() for line in inst.split('\n') if line.strip()]
107
- if lines:
108
- # First line typically contains degree and institution
109
- degree_institution = lines[0]
110
- dates = extract_dates(degree_institution)
111
-
112
- # Try to split degree and institution
113
- parts = re.split(r', | at | - | \| ', degree_institution)
114
- degree = parts[0] if parts else ""
115
- institution = parts[1] if len(parts) > 1 else ""
116
-
117
- data['education'].append({
118
- 'degree': degree,
119
- 'institution': institution,
120
- 'start_date': dates[0] if dates else "",
121
- 'end_date': dates[1] if len(dates) > 1 else ""
122
- })
123
-
124
- # Extract projects
125
- proj_section = extract_section(text, "Projects", "Experience|Education|Skills")
126
- if proj_section:
127
- projects = re.split(r'\n(?=\w)', proj_section) # Split by new line followed by word
128
- for proj in projects:
129
- if proj.strip():
130
- lines = [line.strip() for line in proj.split('\n') if line.strip()]
131
- if lines:
132
- title = lines[0]
133
- description = "\n".join(lines[1:]) if len(lines) > 1 else ""
134
- data['projects'].append({
135
- 'title': title,
136
- 'description': description
137
- })
138
 
139
  return data
140
 
 
25
  return text
26
 
27
  def parse_cv_content(text):
 
 
28
  data = {
29
  'name': '',
30
  'email': '',
 
37
  'projects': []
38
  }
39
 
40
+ # Name extraction (first line with title case)
41
+ lines = [line.strip() for line in text.split('\n') if line.strip()]
42
  if lines:
43
+ data['name'] = lines[0].title()
44
 
45
+ # Email extraction
46
+ email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
47
  emails = re.findall(email_pattern, text)
48
  if emails:
49
  data['email'] = emails[0]
50
 
51
+ # Phone extraction (international format)
52
  phone_pattern = r'(\+?\d[\d\s\-\(\)]{7,}\d)'
53
  phones = re.findall(phone_pattern, text)
54
  if phones:
55
  data['phone'] = phones[0].strip()
56
 
57
+ # LinkedIn URL extraction
58
  linkedin_pattern = r'(https?://)?(www\.)?linkedin\.com/[a-zA-Z0-9\-\./]+'
59
  linkedin = re.search(linkedin_pattern, text)
60
  if linkedin:
61
  data['linkedin'] = linkedin.group(0)
62
 
63
+ # Simple section parsing (improved)
64
+ current_section = None
65
+ for line in lines:
66
+ line_lower = line.lower()
67
+ if 'summary' in line_lower or 'objective' in line_lower:
68
+ current_section = 'summary'
69
+ data[current_section] = line.replace('Summary', '').replace('SUMMARY', '').strip()
70
+ elif 'experience' in line_lower or 'work history' in line_lower:
71
+ current_section = 'experience'
72
+ elif 'education' in line_lower:
73
+ current_section = 'education'
74
+ elif 'skills' in line_lower:
75
+ current_section = 'skills'
76
+ elif 'projects' in line_lower:
77
+ current_section = 'projects'
78
+ elif current_section:
79
+ if current_section == 'summary':
80
+ data[current_section] += ' ' + line
81
+ elif current_section == 'skills' and line.strip():
82
+ data['skills'].extend([s.strip() for s in line.split(',') if s.strip()])
83
+ elif current_section in ['experience', 'education', 'projects'] and line.strip():
84
+ if line.strip() and not line.strip().startswith(('•', '-', '*')):
85
+ # New entry
86
+ data[current_section].append({'title': line, 'description': []})
87
+ elif data[current_section] and line.strip():
88
+ # Continuation of previous entry
89
+ data[current_section][-1]['description'].append(line.strip('•-* '))
90
 
91
+ # Clean up summary
92
+ if 'summary' in data:
93
+ data['summary'] = ' '.join(data['summary'].split())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
  return data
96