Spaces:
Sleeping
Sleeping
Update utils/parsing_utils.py
Browse files- utils/parsing_utils.py +37 -81
utils/parsing_utils.py
CHANGED
|
@@ -25,8 +25,6 @@ def extract_text_from_file(uploaded_file):
|
|
| 25 |
return text
|
| 26 |
|
| 27 |
def parse_cv_content(text):
|
| 28 |
-
"""Parse extracted text into structured CV data"""
|
| 29 |
-
# Basic parsing - this can be enhanced with more sophisticated NLP
|
| 30 |
data = {
|
| 31 |
'name': '',
|
| 32 |
'email': '',
|
|
@@ -39,102 +37,60 @@ def parse_cv_content(text):
|
|
| 39 |
'projects': []
|
| 40 |
}
|
| 41 |
|
| 42 |
-
#
|
| 43 |
-
lines = text.split('\n')
|
| 44 |
if lines:
|
| 45 |
-
data['name'] = lines[0].
|
| 46 |
|
| 47 |
-
#
|
| 48 |
-
email_pattern = r'[
|
| 49 |
emails = re.findall(email_pattern, text)
|
| 50 |
if emails:
|
| 51 |
data['email'] = emails[0]
|
| 52 |
|
| 53 |
-
#
|
| 54 |
phone_pattern = r'(\+?\d[\d\s\-\(\)]{7,}\d)'
|
| 55 |
phones = re.findall(phone_pattern, text)
|
| 56 |
if phones:
|
| 57 |
data['phone'] = phones[0].strip()
|
| 58 |
|
| 59 |
-
#
|
| 60 |
linkedin_pattern = r'(https?://)?(www\.)?linkedin\.com/[a-zA-Z0-9\-\./]+'
|
| 61 |
linkedin = re.search(linkedin_pattern, text)
|
| 62 |
if linkedin:
|
| 63 |
data['linkedin'] = linkedin.group(0)
|
| 64 |
|
| 65 |
-
#
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
|
| 70 |
-
#
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
# Simple parsing - can be enhanced
|
| 74 |
-
positions = re.split(r'\n(?=\w.*\d{4})', exp_section) # Split by new line followed by text and year
|
| 75 |
-
for pos in positions:
|
| 76 |
-
if pos.strip():
|
| 77 |
-
# Simple extraction - can be made more robust
|
| 78 |
-
lines = [line.strip() for line in pos.split('\n') if line.strip()]
|
| 79 |
-
if lines:
|
| 80 |
-
# First line typically contains title, company, dates
|
| 81 |
-
title_company = lines[0]
|
| 82 |
-
dates = extract_dates(title_company)
|
| 83 |
-
|
| 84 |
-
# Try to split title and company
|
| 85 |
-
parts = re.split(r' at |, | - | \| ', title_company)
|
| 86 |
-
title = parts[0] if parts else ""
|
| 87 |
-
company = parts[1] if len(parts) > 1 else ""
|
| 88 |
-
|
| 89 |
-
# Description is the rest
|
| 90 |
-
description = "\n".join(lines[1:]) if len(lines) > 1 else ""
|
| 91 |
-
|
| 92 |
-
data['experience'].append({
|
| 93 |
-
'title': title,
|
| 94 |
-
'company': company,
|
| 95 |
-
'start_date': dates[0] if dates else "",
|
| 96 |
-
'end_date': dates[1] if len(dates) > 1 else "",
|
| 97 |
-
'description': description
|
| 98 |
-
})
|
| 99 |
-
|
| 100 |
-
# Extract education
|
| 101 |
-
edu_section = extract_section(text, "Education", "Experience|Projects|Skills")
|
| 102 |
-
if edu_section:
|
| 103 |
-
institutions = re.split(r'\n(?=\w)', edu_section) # Split by new line followed by word
|
| 104 |
-
for inst in institutions:
|
| 105 |
-
if inst.strip():
|
| 106 |
-
lines = [line.strip() for line in inst.split('\n') if line.strip()]
|
| 107 |
-
if lines:
|
| 108 |
-
# First line typically contains degree and institution
|
| 109 |
-
degree_institution = lines[0]
|
| 110 |
-
dates = extract_dates(degree_institution)
|
| 111 |
-
|
| 112 |
-
# Try to split degree and institution
|
| 113 |
-
parts = re.split(r', | at | - | \| ', degree_institution)
|
| 114 |
-
degree = parts[0] if parts else ""
|
| 115 |
-
institution = parts[1] if len(parts) > 1 else ""
|
| 116 |
-
|
| 117 |
-
data['education'].append({
|
| 118 |
-
'degree': degree,
|
| 119 |
-
'institution': institution,
|
| 120 |
-
'start_date': dates[0] if dates else "",
|
| 121 |
-
'end_date': dates[1] if len(dates) > 1 else ""
|
| 122 |
-
})
|
| 123 |
-
|
| 124 |
-
# Extract projects
|
| 125 |
-
proj_section = extract_section(text, "Projects", "Experience|Education|Skills")
|
| 126 |
-
if proj_section:
|
| 127 |
-
projects = re.split(r'\n(?=\w)', proj_section) # Split by new line followed by word
|
| 128 |
-
for proj in projects:
|
| 129 |
-
if proj.strip():
|
| 130 |
-
lines = [line.strip() for line in proj.split('\n') if line.strip()]
|
| 131 |
-
if lines:
|
| 132 |
-
title = lines[0]
|
| 133 |
-
description = "\n".join(lines[1:]) if len(lines) > 1 else ""
|
| 134 |
-
data['projects'].append({
|
| 135 |
-
'title': title,
|
| 136 |
-
'description': description
|
| 137 |
-
})
|
| 138 |
|
| 139 |
return data
|
| 140 |
|
|
|
|
| 25 |
return text
|
| 26 |
|
| 27 |
def parse_cv_content(text):
    """Parse raw CV text into a structured dict.

    Contact details (email, phone, LinkedIn URL) are pulled out with
    regexes over the whole text; the remaining structure is built by a
    line-by-line scan that switches sections when a line contains a
    section keyword (summary/objective, experience/work history,
    education, skills, projects) and folds subsequent lines into the
    current section.

    Args:
        text: Plain text extracted from an uploaded CV file.

    Returns:
        dict with keys: 'name', 'email', 'phone', 'linkedin',
        'summary' (str); 'skills' (list[str]); 'experience',
        'education', 'projects' (list[dict] with 'title' (str) and
        'description' (list[str]) keys).
    """
    # NOTE(review): the dict entries between 'email' and 'projects' were
    # elided in the diff view this was reconstructed from; they are
    # inferred from how `data` is used below — confirm against the file.
    data = {
        'name': '',
        'email': '',
        'phone': '',
        'linkedin': '',
        'summary': '',
        'skills': [],
        'experience': [],
        'education': [],
        'projects': []
    }

    # Name extraction: first non-empty line, normalized to title case.
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    if lines:
        data['name'] = lines[0].title()

    # Email extraction.
    # FIX: the original class was [A-Z|a-z] — inside a character class
    # '|' is a literal, so it wrongly accepted '|' in the TLD; the
    # intended class is [A-Za-z].
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    emails = re.findall(email_pattern, text)
    if emails:
        data['email'] = emails[0]

    # Phone extraction (international format: optional '+', then at
    # least 9 characters of digits/spaces/dashes/parens).
    phone_pattern = r'(\+?\d[\d\s\-\(\)]{7,}\d)'
    phones = re.findall(phone_pattern, text)
    if phones:
        data['phone'] = phones[0].strip()

    # LinkedIn URL extraction (scheme and 'www.' optional).
    linkedin_pattern = r'(https?://)?(www\.)?linkedin\.com/[a-zA-Z0-9\-\./]+'
    linkedin = re.search(linkedin_pattern, text)
    if linkedin:
        data['linkedin'] = linkedin.group(0)

    # Simple keyword-driven section parsing: a line containing a section
    # keyword switches the current section; later lines are folded into
    # whichever section is active.
    current_section = None
    for line in lines:
        line_lower = line.lower()
        if 'summary' in line_lower or 'objective' in line_lower:
            current_section = 'summary'
            # Keep any text that shares the header line with the keyword.
            data[current_section] = line.replace('Summary', '').replace('SUMMARY', '').strip()
        elif 'experience' in line_lower or 'work history' in line_lower:
            current_section = 'experience'
        elif 'education' in line_lower:
            current_section = 'education'
        elif 'skills' in line_lower:
            current_section = 'skills'
        elif 'projects' in line_lower:
            current_section = 'projects'
        elif current_section:
            if current_section == 'summary':
                data[current_section] += ' ' + line
            elif current_section == 'skills' and line.strip():
                # Skills are comma-separated on each line.
                data['skills'].extend([s.strip() for s in line.split(',') if s.strip()])
            elif current_section in ['experience', 'education', 'projects'] and line.strip():
                if line.strip() and not line.strip().startswith(('•', '-', '*')):
                    # A non-bullet line starts a new entry.
                    data[current_section].append({'title': line, 'description': []})
                elif data[current_section] and line.strip():
                    # Bullet lines continue the previous entry.
                    data[current_section][-1]['description'].append(line.strip('•-* '))

    # Collapse runs of whitespace accumulated while joining summary lines.
    if 'summary' in data:
        data['summary'] = ' '.join(data['summary'].split())

    return data