"""Extract text from uploaded CV files and parse it into structured fields."""

import re
from io import BytesIO

# The extraction back-ends are imported defensively so that the pure-regex
# helpers below (parse_cv_content, extract_section, extract_dates) remain
# importable when a back-end is not installed.  extract_text_from_file raises
# ImportError at call time if it needs a back-end that is missing.
try:
    from pdfminer.high_level import extract_text as extract_text_from_pdf
except ImportError:
    extract_text_from_pdf = None

try:
    from docx import Document
except ImportError:
    Document = None

try:
    import magic
except ImportError:
    magic = None


_DOCX_MIME = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'

# NOTE: the TLD class is [A-Za-z]; a '|' inside a character class is a literal
# pipe, not an alternation.
_EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

# Optional '+', optional opening parenthesis, then digits mixed with spaces,
# dashes and parentheses.  [^\S\r\n] is "whitespace except line breaks", so a
# match can never glue numbers from two different lines together.
_PHONE_RE = re.compile(r'\+?\(?\d(?:[\d\-()]|[^\S\r\n]){7,}\d')

# Case-insensitive because CVs frequently spell the host "LinkedIn.com".
_LINKEDIN_RE = re.compile(
    r'(https?://)?(www\.)?linkedin\.com/[a-zA-Z0-9\-\./]+', re.IGNORECASE
)

# Section name -> keywords that identify its heading.  The order is the
# priority order used when a heading contains keywords of several sections.
_SECTION_KEYWORDS = (
    ('summary', ('summary', 'objective')),
    ('experience', ('experience', 'work history')),
    ('education', ('education',)),
    ('skills', ('skills',)),
    ('projects', ('projects',)),
)

_BULLET_PREFIXES = ('•', '-', '*')
_BULLET_STRIP_CHARS = '•-* '

# Splits "Heading: inline text" / "Heading - inline text" into its two parts.
_HEADER_SPLIT_RE = re.compile(r'\s*:\s*|\s+[-–—]\s+')

# Headings longer than this are assumed to be ordinary sentences that merely
# mention a keyword (e.g. "... with strong skills in Python").
_MAX_HEADING_WORDS = 5

# Ordered from most to least specific; the first pattern that matches wins.
_DATE_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'(\w+\s?\d{4})\s?[-–—]\s?(\w+\s?\d{4}|Present|Current)',  # Jan 2020 - Dec 2022
        r'(\d{4})\s?[-–—]\s?(\d{4}|Present|Current)',  # 2020 - 2022
        r'(\w+\s?\d{4})',  # Jan 2020
        r'(\d{4})',  # 2020
    )
)


def _require(dependency, package_name):
    """Raise ImportError if an optional back-end failed to import."""
    if dependency is None:
        raise ImportError(
            f"{package_name} is required to extract text from this file type"
        )


def extract_text_from_file(uploaded_file):
    """Extract text from uploaded file (PDF, DOCX, or TXT).

    Args:
        uploaded_file: Object exposing ``read()`` that returns the file's
            bytes.  The stream is consumed by this call.

    Returns:
        The extracted text as a string.

    Raises:
        ValueError: If the detected MIME type is not PDF, DOCX or plain text.
        ImportError: If the back-end needed for the detected type is missing.
    """
    _require(magic, 'python-magic')
    file_content = uploaded_file.read()

    # Detect the type from the content itself rather than trusting a filename.
    mime = magic.Magic(mime=True)
    file_type = mime.from_buffer(file_content)

    if file_type == 'application/pdf':
        _require(extract_text_from_pdf, 'pdfminer.six')
        with BytesIO(file_content) as pdf_file:
            return extract_text_from_pdf(pdf_file)

    if file_type == _DOCX_MIME:
        _require(Document, 'python-docx')
        with BytesIO(file_content) as docx_file:
            doc = Document(docx_file)
            # Only body paragraphs are read, as in the original implementation.
            return "\n".join(para.text for para in doc.paragraphs)

    if file_type == 'text/plain':
        # utf-8-sig drops a leading BOM (which would otherwise end up in the
        # parsed name); undecodable bytes are replaced instead of aborting.
        return file_content.decode('utf-8-sig', errors='replace')

    raise ValueError(f"Unsupported file type: {file_type}")


def _match_section_header(line):
    """Return ``(section, inline_text)`` if *line* looks like a heading, else None."""
    if line.startswith(_BULLET_PREFIXES):
        return None

    parts = _HEADER_SPLIT_RE.split(line, maxsplit=1)
    heading = parts[0]
    inline_text = parts[1].strip() if len(parts) > 1 else ''

    if len(heading.split()) > _MAX_HEADING_WORDS:
        return None

    heading_lower = heading.lower()
    for section, keywords in _SECTION_KEYWORDS:
        if any(keyword in heading_lower for keyword in keywords):
            return section, inline_text
    return None


def _add_entry_line(entries, line):
    """Add *line* to an experience/education/projects entry list."""
    if not line.startswith(_BULLET_PREFIXES):
        # A non-bullet line starts a new entry.
        entries.append({'title': line, 'description': []})
        return

    # Bullets that appear before any entry title have nothing to attach to and
    # are dropped, matching the original behaviour.
    if entries:
        # lstrip only: a trailing '-' or '*' (e.g. "Grade: A-") is content.
        detail = line.lstrip(_BULLET_STRIP_CHARS)
        if detail:
            entries[-1]['description'].append(detail)


def parse_cv_content(text):
    """Parse raw CV text into a dictionary of structured fields.

    Args:
        text: Full CV text; lines are separated by ``\\n``.  Empty or ``None``
            input yields the empty result structure.

    Returns:
        A dict with keys ``name``, ``email``, ``phone``, ``linkedin`` and
        ``summary`` (strings), ``skills`` (list of strings) and
        ``experience``, ``education``, ``projects`` (lists of
        ``{'title': str, 'description': list[str]}`` dicts).
    """
    data = {
        'name': '',
        'email': '',
        'phone': '',
        'linkedin': '',
        'summary': '',
        'skills': [],
        'experience': [],
        'education': [],
        'projects': [],
    }
    if not text:
        return data

    lines = [line.strip() for line in text.split('\n') if line.strip()]

    # Name extraction: the first non-empty line, title-cased.
    if lines:
        data['name'] = lines[0].title()

    # Contact details: the first match in the whole text wins.
    email = _EMAIL_RE.search(text)
    if email:
        data['email'] = email.group(0)

    phone = _PHONE_RE.search(text)
    if phone:
        data['phone'] = phone.group(0).strip()

    linkedin = _LINKEDIN_RE.search(text)
    if linkedin:
        # The URL character class accepts '.', so trim a sentence-ending period.
        data['linkedin'] = linkedin.group(0).rstrip('.')

    # Section parsing: a heading switches the current section; every other
    # line is content of whichever section is current.
    summary_parts = []
    current_section = None
    for line in lines:
        header = _match_section_header(line)
        if header is not None:
            # Text after "Heading:" on the same line is regular content.
            current_section, line = header
            if not line:
                continue
        if current_section is None:
            continue

        if current_section == 'summary':
            summary_parts.append(line)
        elif current_section == 'skills':
            skills_text = line.lstrip(_BULLET_STRIP_CHARS)
            data['skills'].extend(
                skill.strip() for skill in skills_text.split(',') if skill.strip()
            )
        else:
            _add_entry_line(data[current_section], line)

    # Collapse runs of whitespace in the assembled summary.
    data['summary'] = ' '.join(' '.join(summary_parts).split())

    return data


def extract_section(text, start_pattern, end_pattern):
    """Extract a section between start and end patterns.

    Args:
        text: Text to search; empty or ``None`` yields ``None``.
        start_pattern: Regex (matched case-insensitively) marking the start;
            the section begins right after this match.
        end_pattern: Regex (matched case-insensitively) marking the end,
            searched only in the text after the start match.

    Returns:
        The stripped text between the two matches, the stripped remainder of
        the text if the end pattern is not found, or ``None`` if the start
        pattern is not found.
    """
    if not text:
        return None

    start = re.search(start_pattern, text, re.IGNORECASE)
    if start is None:
        return None

    remaining_text = text[start.end():]
    end = re.search(end_pattern, remaining_text, re.IGNORECASE)
    if end is not None:
        remaining_text = remaining_text[:end.start()]
    return remaining_text.strip()


def extract_dates(text):
    """Extract dates from text (simple pattern matching).

    Patterns are tried from most to least specific (date range, year range,
    "word + year", bare year) and only the first match of the first matching
    pattern is used.  The word before a year is not validated as a month.

    Args:
        text: Text to search; empty or ``None`` yields ``[]``.

    Returns:
        ``[start, end]`` for a range, ``[date]`` for a single date, or ``[]``
        when nothing matches.  "Present"/"Current" are accepted in any case.
    """
    if not text:
        return []

    for pattern in _DATE_PATTERNS:
        match = pattern.search(text)
        if match:
            return [group.strip() for group in match.groups()]
    return []