Spaces:
Sleeping
Sleeping
| import re | |
| from io import BytesIO | |
| from pdfminer.high_level import extract_text as extract_text_from_pdf | |
| from docx import Document | |
| import magic | |
def extract_text_from_file(uploaded_file):
    """Extract plain text from an uploaded PDF, DOCX, or plain-text file.

    The file type is detected from the file's content (via the ``magic``
    module's MIME sniffing), not from its name or extension.

    Args:
        uploaded_file: File-like object whose ``read()`` returns the raw
            bytes of the upload. It is read once, in full.

    Returns:
        The extracted text as a ``str``.

    Raises:
        ValueError: If the detected MIME type is not PDF, DOCX, or
            ``text/plain``.
    """
    file_content = uploaded_file.read()
    file_type = magic.Magic(mime=True).from_buffer(file_content)

    if file_type == 'application/pdf':
        with BytesIO(file_content) as pdf_file:
            return extract_text_from_pdf(pdf_file)

    if file_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
        with BytesIO(file_content) as docx_file:
            doc = Document(docx_file)
            # Only body paragraphs are collected, one per output line.
            return "\n".join(para.text for para in doc.paragraphs)

    if file_type == 'text/plain':
        # 'utf-8-sig' drops a leading BOM so it cannot leak into the first
        # line of the text; errors='replace' keeps a text file in another
        # encoding from aborting the whole upload with UnicodeDecodeError
        # (undecodable bytes become U+FFFD instead).
        return file_content.decode('utf-8-sig', errors='replace')

    raise ValueError(f"Unsupported file type: {file_type}")
# Contact-detail patterns, compiled once at import time.
_EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
_PHONE_RE = re.compile(r'(\+?\d[\d\s\-\(\)]{7,}\d)')
_LINKEDIN_RE = re.compile(r'(https?://)?(www\.)?linkedin\.com/[a-zA-Z0-9\-\./]+')

# Heading at the start of a summary line, e.g. "Summary:", "CAREER OBJECTIVE -".
# \u2013 / \u2014 are the en dash and em dash.
_SUMMARY_HEADING_RE = re.compile(
    r'^(?:(?:professional|career|executive)\s+)?(?:summary|objectives?)\b'
    r'\s*[:\-\u2013\u2014]?\s*',
    re.IGNORECASE,
)

# Checked in order; the first section with a keyword found in the line wins.
_SECTION_KEYWORDS = (
    ('summary', ('summary', 'objective')),
    ('experience', ('experience', 'work history')),
    ('education', ('education',)),
    ('skills', ('skills',)),
    ('projects', ('projects',)),
)

# Written as escapes so the bullet survives any re-encoding of this file.
_BULLET_PREFIXES = ('\u2022', '-', '*')  # U+2022 BULLET, hyphen, asterisk
_BULLET_STRIP_CHARS = '\u2022-* '


def _section_for_heading(line_lower):
    """Return the section whose keyword occurs in *line_lower*, else None."""
    for section, keywords in _SECTION_KEYWORDS:
        if any(keyword in line_lower for keyword in keywords):
            return section
    return None


def parse_cv_content(text):
    """Parse raw CV text into a dictionary of structured fields.

    This is a line-based heuristic parser:

    * ``name`` is the first non-blank line, title-cased.
    * ``email``, ``phone`` and ``linkedin`` are the first regex match found
      anywhere in the text.
    * Any line containing a section keyword (summary/objective, experience/
      work history, education, skills, projects) switches the current
      section. The keyword is matched as a substring, so a content line that
      happens to contain one of these words is also treated as a heading.
    * Within ``skills`` each line is split on commas. Within ``experience``,
      ``education`` and ``projects`` a non-bulleted line starts a new entry
      and bulleted lines are appended to the latest entry's description
      (bulleted lines before any entry are dropped).

    Args:
        text: The CV contents as a single string.

    Returns:
        A dict with keys ``name``, ``email``, ``phone``, ``linkedin``,
        ``summary`` (strings, empty when not found), ``skills`` (list of
        strings) and ``experience``, ``education``, ``projects`` (lists of
        ``{'title': str, 'description': [str, ...]}`` dicts).
    """
    data = {
        'name': '',
        'email': '',
        'phone': '',
        'linkedin': '',
        'summary': '',
        'skills': [],
        'experience': [],
        'education': [],
        'projects': [],
    }

    # Blank lines carry no information for this parser; every remaining
    # line is already stripped of surrounding whitespace.
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    if lines:
        data['name'] = lines[0].title()

    email = _EMAIL_RE.search(text)
    if email:
        data['email'] = email.group(0)

    phone = _PHONE_RE.search(text)
    if phone:
        data['phone'] = phone.group(0).strip()

    linkedin = _LINKEDIN_RE.search(text)
    if linkedin:
        data['linkedin'] = linkedin.group(0)

    current_section = None
    for line in lines:
        heading = _section_for_heading(line.lower())
        if heading is not None:
            current_section = heading
            if heading == 'summary':
                # Keep any text that shares the line with the heading
                # ("Summary: Experienced ...") but drop the heading itself,
                # whatever its casing.
                data['summary'] = _SUMMARY_HEADING_RE.sub('', line, count=1).strip()
            continue

        if current_section is None:
            # Preamble before the first recognised heading (name, contacts).
            continue

        if current_section == 'summary':
            data['summary'] += ' ' + line
        elif current_section == 'skills':
            data['skills'].extend(s.strip() for s in line.split(',') if s.strip())
        elif not line.startswith(_BULLET_PREFIXES):
            # A non-bulleted line opens a new experience/education/project entry.
            data[current_section].append({'title': line, 'description': []})
        elif data[current_section]:
            # A bulleted line elaborates on the most recent entry.
            data[current_section][-1]['description'].append(
                line.strip(_BULLET_STRIP_CHARS)
            )

    # Collapse runs of whitespace left over from joining summary lines.
    data['summary'] = ' '.join(data['summary'].split())
    return data
def extract_section(text, start_pattern, end_pattern):
    """Return the text lying between two regex matches.

    Both patterns are searched case-insensitively. The section begins right
    after the first match of ``start_pattern`` and runs up to the first
    subsequent match of ``end_pattern``, or to the end of ``text`` when the
    end pattern never matches.

    Args:
        text: The string to search.
        start_pattern: Regex marking where the section starts.
        end_pattern: Regex marking where the section stops.

    Returns:
        The section with surrounding whitespace stripped, or ``None`` if
        ``start_pattern`` does not match at all.
    """
    opening = re.search(start_pattern, text, re.IGNORECASE)
    if opening is None:
        return None

    # Only look for the end marker after the start marker.
    tail = text[opening.end():]
    closing = re.search(end_pattern, tail, re.IGNORECASE)
    body = tail if closing is None else tail[:closing.start()]
    return body.strip()
# Tried in order, most specific first. The range separator accepts a hyphen,
# an en dash (\u2013) or an em dash (\u2014); the dashes are written as
# escapes so they survive any re-encoding of this file. IGNORECASE lets
# "present"/"current" match in any casing.
_DATE_PATTERNS = (
    # Jan 2020 - Dec 2022 / Jan 2020 - Present
    re.compile(
        r'(\w+\s?\d{4})\s?[-\u2013\u2014]\s?(\w+\s?\d{4}|Present|Current)',
        re.IGNORECASE,
    ),
    # 2020 - 2022 / 2020 - Present
    re.compile(r'(\d{4})\s?[-\u2013\u2014]\s?(\d{4}|Present|Current)', re.IGNORECASE),
    # Jan 2020
    re.compile(r'(\w+\s?\d{4})'),
    # 2020
    re.compile(r'(\d{4})'),
)


def extract_dates(text):
    """Extract the first date or date range found in *text*.

    Patterns are tried from most to least specific (month-year range, year
    range, single month-year, single year); the first pattern that matches
    anywhere in the text decides the result, and only its first match is
    used.

    Args:
        text: The string to scan.

    Returns:
        ``[start, end]`` for a range (``end`` may be "Present"/"Current" as
        written in the text), ``[date]`` for a single date, or ``[]`` when
        nothing date-like is found.
    """
    for pattern in _DATE_PATTERNS:
        match = pattern.search(text)
        if match:
            # Every group in these patterns is mandatory, so groups() holds
            # one string for single-date patterns and two for ranges.
            return [part.strip() for part in match.groups()]
    return []