import re
from io import BytesIO
from pdfminer.high_level import extract_text as extract_text_from_pdf
from docx import Document
import magic

def extract_text_from_file(uploaded_file):
    """Extract plain text from an uploaded PDF, DOCX, or plain-text file.

    The format is detected from the file's content (MIME sniffing through
    the ``magic`` module), not from a file name or extension.

    Args:
        uploaded_file: File-like object whose ``read()`` returns the whole
            file. The result is assumed to be ``bytes`` (it is wrapped in
            ``BytesIO`` and decoded below) -- TODO confirm against callers.

    Returns:
        The extracted text as a ``str``.

    Raises:
        ValueError: If the detected MIME type is not PDF, DOCX, or
            ``text/plain``.
    """
    file_content = uploaded_file.read()
    file_type = magic.Magic(mime=True).from_buffer(file_content)

    if file_type == 'application/pdf':
        with BytesIO(file_content) as pdf_file:
            text = extract_text_from_pdf(pdf_file)
    elif file_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
        with BytesIO(file_content) as docx_file:
            doc = Document(docx_file)
            # Only body paragraphs are read; one output line per paragraph.
            text = "\n".join(para.text for para in doc.paragraphs)
    elif file_type == 'text/plain':
        # 'utf-8-sig' drops a leading byte-order mark so it does not end up
        # in the first line of the text, and errors='replace' keeps files in
        # legacy encodings readable instead of raising UnicodeDecodeError.
        text = file_content.decode('utf-8-sig', errors='replace')
    else:
        raise ValueError(f"Unsupported file type: {file_type}")

    return text

def parse_cv_content(text):
    """Parse raw CV text into contact details and sectioned content.

    Contact fields (email, phone, LinkedIn) are taken from the first regex
    match anywhere in the text. Sections are found by scanning the non-empty
    lines for heading keywords; lines after a heading belong to that section
    until the next heading is seen.

    Args:
        text: Full CV text with newline-separated lines.

    Returns:
        A dict with these keys:
            name: First non-empty line, title-cased ('' if there is none).
            email, phone, linkedin: First match, or ''.
            summary: Summary/objective text with whitespace collapsed.
            skills: List of comma-separated items from the skills section.
            experience, education, projects: Lists of
                ``{'title': str, 'description': list[str]}`` entries. A
                non-bullet line starts a new entry; bullet lines are added
                to the description of the latest entry.
    """
    data = {
        'name': '',
        'email': '',
        'phone': '',
        'linkedin': '',
        'summary': '',
        'skills': [],
        'experience': [],
        'education': [],
        'projects': []
    }

    # Every element is already stripped and non-empty.
    lines = [line.strip() for line in text.split('\n') if line.strip()]

    # Name extraction: the first non-empty line, title-cased.
    if lines:
        data['name'] = lines[0].title()

    # Email extraction. The TLD class is [A-Za-z]; a '|' inside a character
    # class is a literal pipe, not alternation, and would let pipe-separated
    # contact details run together.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    email = re.search(email_pattern, text)
    if email:
        data['email'] = email.group(0)

    # Phone extraction (international format).
    # NOTE(review): this pattern also accepts other digit runs such as
    # '2019 - 2022'; the first match in the text wins.
    phone_pattern = r'(\+?\d[\d\s\-\(\)]{7,}\d)'
    phone = re.search(phone_pattern, text)
    if phone:
        data['phone'] = phone.group(0).strip()

    # LinkedIn URL extraction.
    linkedin_pattern = r'(https?://)?(www\.)?linkedin\.com/[a-zA-Z0-9\-\./]+'
    linkedin = re.search(linkedin_pattern, text)
    if linkedin:
        data['linkedin'] = linkedin.group(0)

    # Heading keywords, checked in this order; the first hit decides.
    section_keywords = (
        ('summary', ('summary', 'objective')),
        ('experience', ('experience', 'work history')),
        ('education', ('education',)),
        ('skills', ('skills',)),
        ('projects', ('projects',)),
    )
    bullet_marks = ('•', '-', '*')

    current_section = None
    for line in lines:
        is_bullet = line.startswith(bullet_marks)

        # A bullet line is content, never a heading: "- Led projects ..."
        # inside the experience section must not switch to 'projects'.
        heading = None
        if not is_bullet:
            line_lower = line.lower()
            heading = next(
                (section for section, words in section_keywords
                 if any(word in line_lower for word in words)),
                None,
            )

        if heading is not None:
            current_section = heading
            if heading == 'summary':
                # Keep only the text after the heading word, in any letter
                # case, and drop the separator that follows it, so that
                # "Objective: Build tools" yields "Build tools".
                found = re.search(r'summary|objectives?', line, re.IGNORECASE)
                inline = line[found.end():] if found else line
                data['summary'] = inline.lstrip(' \t:-–—').strip()
        elif current_section == 'summary':
            data['summary'] += ' ' + line
        elif current_section == 'skills':
            data['skills'].extend(s.strip() for s in line.split(',') if s.strip())
        elif current_section in ('experience', 'education', 'projects'):
            if not is_bullet:
                # New entry
                data[current_section].append({'title': line, 'description': []})
            elif data[current_section]:
                # Continuation of previous entry; a bullet that appears
                # before any entry has nothing to attach to and is skipped.
                data[current_section][-1]['description'].append(line.strip('•-* '))

    # Collapse runs of whitespace in the summary.
    data['summary'] = ' '.join(data['summary'].split())

    return data

def extract_section(text, start_pattern, end_pattern):
    """Return the text found between two regex markers.

    Both patterns are matched case-insensitively. The section begins right
    after the first match of ``start_pattern`` and stops just before the
    first later match of ``end_pattern``; when no end marker follows, the
    section runs to the end of ``text``.

    Returns:
        The section with surrounding whitespace stripped, or ``None`` when
        ``start_pattern`` does not occur in ``text``.
    """
    opening = re.search(start_pattern, text, re.IGNORECASE)
    if opening is None:
        return None

    tail = text[opening.end():]
    closing = re.search(end_pattern, tail, re.IGNORECASE)
    section = tail if closing is None else tail[:closing.start()]
    return section.strip()

# Month name, full or abbreviated, with an optional trailing period.
_MONTH = (
    r'\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?'
    r'|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?'
    r'|Dec(?:ember)?)\.?'
)
# Exactly four digits that are not part of a longer digit run.
_YEAR = r'(?<!\d)\d{4}(?!\d)'
_MONTH_YEAR = _MONTH + r'\s?' + _YEAR
_ONGOING = r'(?:Present|Current)\b'
_DASH = r'\s*[-–—]\s*'

# Ordered from most to least specific; the first pattern that matches wins.
_DATE_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        '(' + _MONTH_YEAR + ')' + _DASH + '(' + _MONTH_YEAR + '|' + _ONGOING + ')',  # Jan 2020 - Dec 2022
        '(' + _YEAR + ')' + _DASH + '(' + _YEAR + '|' + _ONGOING + ')',              # 2020 - 2022
        '(' + _MONTH_YEAR + ')',                                                     # Jan 2020
        '(' + _YEAR + ')',                                                           # 2020
    )
)


def extract_dates(text):
    """Extract the first date or date range found in text.

    Patterns are tried from most to least specific: month-year range,
    year range, single month-year, single year. Only the first occurrence
    of the first pattern that matches is reported.

    Month names are English, full or abbreviated, in any letter case. A
    year is exactly four digits that are not part of a longer number, so a
    phone number does not yield a year. An arbitrary word before a year
    (``"in 2020"``) is not treated as part of the date.

    Args:
        text: Text to search.

    Returns:
        ``[start, end]`` for a range (``end`` may be "Present"/"Current" as
        written in the text), ``[date]`` for a single date, or ``[]`` when
        nothing matches.
    """
    for pattern in _DATE_PATTERNS:
        match = pattern.search(text)
        if match:
            return [part.strip() for part in match.groups()]

    return []